diff --git a/.gitignore b/.gitignore
index 2d7730711..f4fb46f2e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,8 @@
 *.iws
 *~
 .vscode
+.metals
+.bloop
 .classpath
 /*/.classpath
 /*/*/.classpath
@@ -24,4 +26,5 @@
 spark-warehouse
 /**/job-override.properties
 /**/*.log
+/**/.factorypath
 
diff --git a/100.patch b/100.patch
new file mode 100644
index 000000000..f28cdd0a5
--- /dev/null
+++ b/100.patch
@@ -0,0 +1,757 @@
+From c5fbad8093ca27deebf1b5fd5ffd39e1877c533d Mon Sep 17 00:00:00 2001
+From: antleb
+Date: Thu, 4 Mar 2021 00:42:21 +0200
+Subject: [PATCH 1/8] Contexts are now downloaded instead of using the
+ stats_ext db
+
+---
+ .../dhp/oa/graph/stats/oozie_app/contexts.sh  | 33 +++++++++++++++++++
+ .../graph/stats/oozie_app/scripts/step10.sql  | 13 --------
+ .../dhp/oa/graph/stats/oozie_app/workflow.xml | 17 ++++++++++
+ 3 files changed, 50 insertions(+), 13 deletions(-)
+ create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+new file mode 100644
+index 00000000..f06a43bb
+--- /dev/null
++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+@@ -0,0 +1,33 @@
++#!/usr/bin/env bash
++
++CONTEXT_API=$1
++TARGET_DB=$2
++
++TMP=/tmp/stats-update-`tr -dc A-Za-z0-9 </dev/urandom | head -c 6`
++
++echo "Downloading context data"
++curl ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv
++cat contexts.csv | cut -d , -f1 | xargs -I {} curl ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
++cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
++cat contexts.csv | cut -f1 -d, | sed 's/\(.*\)/\1,\1::other,other/' >> categories.csv
++cat categories.csv | cut -d, -f2 | sed 's/\(.*\)/\1,\1::other,other/' >> concepts.csv
++
++echo "uploading context data to hdfs"
++hdfs dfs -mkdir ${TMP}
++hdfs dfs -copyFromLocal contexts.csv ${TMP}
++hdfs dfs -copyFromLocal categories.csv ${TMP}
++hdfs dfs -copyFromLocal concepts.csv ${TMP}
++hdfs dfs -chmod -R 777 ${TMP}
++
++echo "Creating and populating impala tables"
++impala-shell -c "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ',';"
++impala-shell -c "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ',';"
++impala-shell -c "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ',';"
++impala-shell -c "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context;"
++impala-shell -c "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category;"
++impala-shell -c "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept;"
++
++echo "Cleaning up"
++hdfs dfs -rm -f -r -skipTrash ${TMP}
++
++echo "Finito!"
+\ No newline at end of file
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql
+index 6c96317e..77fbd3b1 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql
++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql
+@@ -23,19 +23,6 @@ CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS
+ SELECT *
+ FROM ${external_stats_db_name}.rndexpediture;
+ 
+-CREATE OR REPLACE VIEW ${stats_db_name}.context AS
+-SELECT *
+-FROM ${external_stats_db_name}.context;
+-
+-CREATE OR REPLACE VIEW ${stats_db_name}.category AS
+-SELECT *
+-FROM ${external_stats_db_name}.category;
+-
+-CREATE OR REPLACE VIEW ${stats_db_name}.concept AS
+-SELECT *
+-FROM ${external_stats_db_name}.concept;
+-
+-
+ ------------------------------------------------------------------------------------------------
+ ------------------------------------------------------------------------------------------------
+ -- Creation date of the database
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+index 9c16f149..afb10c41 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+@@ -41,6 +41,10 @@
+             hive_timeout
+             the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a hearbeat. The default value is 300 seconds.
+        
++        
++            context_api_url
++            the base url of the context api (https://services.openaire.eu/openaire)
++        
+        
+    
+    
+@@ -263,6 +267,19 @@
+            
+        
+        
++        
++            
++                ${jobTracker}
++                ${nameNode}
++                contexts.sh
++                ${context_api_url}
++                ${stats_db_name}
++                contexts.sh
++            
++            
++            
++        
++        
+        
+        
+        
+--
+2.17.1
+
+
+From 6147ee495053634436abe822aaf9ba909813d8c4 Mon Sep 17 00:00:00 2001
+From: antleb
+Date: Fri, 5 Mar 2021 14:12:18 +0200
+Subject: [PATCH 2/8] assigning correctly hive contexts to concepts
+
+---
+ .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh    | 7 +++++--
+ .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql | 5 ++++-
+ .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql | 5 ++++-
+ .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql | 5 ++++-
+ .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql | 5 ++++-
+ 5 files changed, 21 insertions(+), 6 deletions(-)
+
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+index f06a43bb..6788f88b 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+@@ -9,8 +9,8 @@ echo "Downloading context data"
+ curl ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv
+ cat contexts.csv | cut -d , -f1 | xargs -I {} curl ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
+ cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
+-cat contexts.csv | cut -f1 -d, | sed 's/\(.*\)/\1,\1::other,other/' >> categories.csv
+-cat categories.csv | cut -d, -f2 | sed 's/\(.*\)/\1,\1::other,other/' >> concepts.csv
++cat contexts.csv | sed 's/^\(.*\),\(.*\)/\1,\1::other,\2/' >> categories.csv
++cat categories.csv | grep -v ::other | sed 's/^.*,\(.*\),\(.*\)/\1,\1::other,\2/' >> concepts.csv
+ 
+ echo "uploading context data to hdfs"
+ hdfs dfs -mkdir ${TMP}
+@@ -29,5 +29,8 @@ impala-shell -c "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.
+ 
+ echo "Cleaning up"
+ hdfs dfs -rm -f -r -skipTrash ${TMP}
++rm concepts.csv
++rm categories.csv
++rm contexts.csv
+ 
+ echo "Finito!"
+\ No newline at end of file
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
+index 62a15856..75b24b18 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
+@@ -47,7 +47,10 @@ from ${openaire_db_name}.publication p
+ where p.datainfo.deletedbyinference = false;
+ 
+ CREATE TABLE ${stats_db_name}.publication_concepts AS
+-SELECT substr(p.id, 4) as id, contexts.context.id as concept
++SELECT substr(p.id, 4) as id, case
++    when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
++    when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
++    when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
+ from ${openaire_db_name}.publication p
+ LATERAL VIEW explode(p.context) contexts as context
+ where p.datainfo.deletedbyinference = false;
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql
+index dcd5ad85..540cc03a 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql
++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql
+@@ -54,7 +54,10 @@ FROM ${openaire_db_name}.dataset p
+ where p.datainfo.deletedbyinference = false;
+ 
+ CREATE TABLE ${stats_db_name}.dataset_concepts AS
+-SELECT substr(p.id, 4) as id, contexts.context.id as concept
++SELECT substr(p.id, 4) as id, case
++    when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
++    when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
++    when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
+ from ${openaire_db_name}.dataset p
+ LATERAL VIEW explode(p.context) contexts as context
+ where p.datainfo.deletedbyinference = false;
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
+index fd5390e6..54345e07 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
+@@ -54,7 +54,10 @@ FROM ${openaire_db_name}.software p
+ where p.datainfo.deletedbyinference = false;
+ 
+ CREATE TABLE ${stats_db_name}.software_concepts AS
+-SELECT substr(p.id, 4) AS id, contexts.context.id AS concept
++SELECT substr(p.id, 4) as id, case
++    when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
++    when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
++    when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
+ FROM ${openaire_db_name}.software p
+ LATERAL VIEW explode(p.context) contexts AS context
+ where p.datainfo.deletedbyinference = false;
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
+index b359b596..36ad5d92 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
+@@ -52,7 +52,10 @@ FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.
+ where p.datainfo.deletedbyinference = false;
+ 
+ CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS
+-SELECT substr(p.id, 4) AS id, contexts.context.id AS concept
++SELECT substr(p.id, 4) as id, case
++    when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
++    when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
++    when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
+ FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context
+ where p.datainfo.deletedbyinference = false;
+ 
+--
+2.17.1
+
+
+From f40c150a0d549e2dbcfd42ecf81e17ad4b505391 Mon Sep 17 00:00:00 2001
+From: antleb
+Date: Sat, 6 Mar 2021 00:35:57 +0200
+Subject: [PATCH 3/8] fixed steps...
+
+---
+ .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+index afb10c41..2184cb8a 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+@@ -264,7 +264,7 @@
+                 stats_db_name=${stats_db_name}
+                 openaire_db_name=${openaire_db_name}
+            
+-            
++            
+        
+        
+        
+@@ -277,7 +277,7 @@
+             ${stats_db_name}
+             contexts.sh
+        
+-        
++        
+        
+        
+        
+--
+2.17.1
+
+
+From fa1ec5b5e9b6038b3b565422af5c6406f21220d3 Mon Sep 17 00:00:00 2001
+From: antleb
+Date: Wed, 10 Mar 2021 14:05:58 +0200
+Subject: [PATCH 4/8] fixed typo...
+
+---
+ .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+index 2184cb8a..321500e2 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+@@ -277,7 +277,7 @@
+             ${stats_db_name}
+             contexts.sh
+        
+-        
++        
+        
+        
+        
+--
+2.17.1
+
+
+From 3c75a050443942b632cf8469b5af16a8c61e7569 Mon Sep 17 00:00:00 2001
+From: antleb
+Date: Fri, 12 Mar 2021 13:47:04 +0200
+Subject: [PATCH 5/8] fixed a ton of typos
+
+---
+ .../scripts/computeProductionStats.sql        |  8 -------
+ .../stats/oozie_app/updateProductionViews.sh  | 18 ++++++++++++++++
+ .../dhp/oa/graph/stats/oozie_app/contexts.sh  | 21 ++++++++++++-------
+ 3 files changed, 32 insertions(+), 15 deletions(-)
+ delete mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql
+ create mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh
+
+diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql
+deleted file mode 100644
+index 34e48a18..00000000
+--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql
++++ /dev/null
+@@ -1,8 +0,0 @@
+-------------------------------------------------------
+-------------------------------------------------------
+--- Impala table statistics - Needed to make the tables
+--- visible for impala
+-------------------------------------------------------
+-------------------------------------------------------
+-
+-INVALIDATE METADATA ${stats_db_name};
+diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh
+new file mode 100644
+index 00000000..57acb2ee
+--- /dev/null
++++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh
+@@ -0,0 +1,18 @@
++export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
++export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
++if ! [ -L $link_folder ]
++then
++    rm -Rf "$link_folder"
++    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
++fi
++
++export SOURCE=$1
++export SHADOW=$2
++
++echo "Updating shadow database"
++impala-shell -d ${SOURCE} -q "invalidate metadata"
++impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${SOURCE}.\1;/" | impala-shell -c -f -
++impala-shell -q "create database if not exists ${SHADOW}"
++impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -c -f -
++impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
++echo "Shadow db ready!"
+\ No newline at end of file
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+index 6788f88b..c28be50d 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+@@ -1,4 +1,10 @@
+-#!/usr/bin/env bash
++export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
++export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
++if ! [ -L $link_folder ]
++then
++    rm -Rf "$link_folder"
++    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
++fi
+ 
+ CONTEXT_API=$1
+ TARGET_DB=$2
+@@ -20,12 +26,13 @@ hdfs dfs -copyFromLocal concepts.csv ${TMP}
+ hdfs dfs -chmod -R 777 ${TMP}
+ 
+ echo "Creating and populating impala tables"
+-impala-shell -c "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ',';"
+-impala-shell -c "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ',';"
+-impala-shell -c "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ',';"
+-impala-shell -c "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context;"
+-impala-shell -c "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category;"
+-impala-shell -c "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept;"
++impala-shell -q "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','"
++impala-shell -q "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','"
++impala-shell -q "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','"
++impala-shell -d ${TARGET_DB} -q "invalidate metadata"
++impala-shell -q "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context"
++impala-shell -q "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category"
++impala-shell -q "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept"
+ 
+ echo "Cleaning up"
+ hdfs dfs -rm -f -r -skipTrash ${TMP}
+--
+2.17.1
+
+
+From 236435b47010ea1ab94c3f018dcf278f5d2c44aa Mon Sep 17 00:00:00 2001
+From: antleb
+Date: Fri, 12 Mar 2021 14:11:21 +0200
+Subject: [PATCH 6/8] following redirects
+
+---
+ .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+index c28be50d..29b225e3 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+@@ -12,9 +12,9 @@ TARGET_DB=$2
+ TMP=/tmp/stats-update-`tr -dc A-Za-z0-9 </dev/urandom | head -c 6`
+ 
+ echo "Downloading context data"
+-curl ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv
+-cat contexts.csv | cut -d , -f1 | xargs -I {} curl ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
+-cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
++curl -L ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv
++cat contexts.csv | cut -d , -f1 | xargs -I {} curl -L ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
++cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl -L ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
+ cat contexts.csv | sed 's/^\(.*\),\(.*\)/\1,\1::other,\2/' >> categories.csv
+ cat categories.csv | grep -v ::other | sed 's/^.*,\(.*\),\(.*\)/\1,\1::other,\2/' >> concepts.csv
+ 
+--
+2.17.1
+
+
+From 60ebdf2dbe704733809f401df70bffcf49cede29 Mon Sep 17 00:00:00 2001
+From: antleb
+Date: Fri, 12 Mar 2021 16:34:53 +0200
+Subject: [PATCH 7/8] update promote wf to support monitor&production
+
+---
+ .../oa/graph/stats/oozie_app/impala-shell.sh  |  18 --
+ .../scripts/updateProductionViews.sql         | 207 ------------------
+ 2 files changed, 225 deletions(-)
+ delete mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh
+ delete mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql
+
+diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh
+deleted file mode 100644
+index 70112dc7..00000000
+--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh
++++ /dev/null
+@@ -1,18 +0,0 @@
+-export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
+-export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
+-if ! [ -L $link_folder ]
+-then
+-    rm -Rf "$link_folder"
+-    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
+-fi
+-
+-echo "Getting file from " $3
+-hdfs dfs -copyToLocal $3
+-
+-echo "Running impala shell make the new database visible"
+-impala-shell -q "INVALIDATE METADATA;"
+-
+-echo "Running impala shell to compute new table stats"
+-impala-shell -d $1 -f $2
+-echo "Impala shell finished"
+-rm $2
+diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql
+deleted file mode 100644
+index 48f8d58f..00000000
+--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql
++++ /dev/null
+@@ -1,207 +0,0 @@
+-------------------------------------------------------
+-------------------------------------------------------
+--- Shadow schema table exchange
+-------------------------------------------------------
+-------------------------------------------------------
+-
+--- Dropping old views
+-DROP VIEW IF EXISTS ${stats_db_production_name}.category;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.concept;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.context;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.country;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.countrygdp;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.creation_date;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_citations;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_classifications;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_concepts;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_datasources;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_languages;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_licenses;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_oids;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_pids;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_refereed;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_sources;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_topics;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_languages;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_oids;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_organizations;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_results;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_sources;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.funder;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.fundref;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.numbers_country;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.organization;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.organization_datasources;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.organization_pids;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.organization_projects;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.organization_sources;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_citations;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_classifications;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_concepts; +-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_datasources; +-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_languages; +-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_licenses; +-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_oids; +-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_pids; +-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_refereed; +-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_sources; +-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_topics; +-DROP VIEW IF EXISTS ${stats_db_production_name}.project; +-DROP VIEW IF EXISTS ${stats_db_production_name}.project_oids; +-DROP VIEW IF EXISTS ${stats_db_production_name}.project_organizations; +-DROP VIEW IF EXISTS ${stats_db_production_name}.project_results; +-DROP VIEW IF EXISTS ${stats_db_production_name}.project_resultcount; +-DROP VIEW IF EXISTS ${stats_db_production_name}.project_results_publication; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_citations; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_classifications; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_concepts; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_datasources; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_languages; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_licenses; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_oids; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_pids; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_refereed; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_sources; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_topics; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_affiliated_country; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_citations; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_classifications; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_concepts; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_datasources; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_deposited_country; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_fundercount; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_gold; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_greenoa; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_languages; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_licenses; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_oids; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_organization; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_peerreviewed; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_pids; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_projectcount; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_projects; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_refereed; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_sources; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_topics; +-DROP VIEW IF EXISTS ${stats_db_production_name}.rndexpediture; +-DROP VIEW IF EXISTS ${stats_db_production_name}.roarmap; +-DROP VIEW 
IF EXISTS ${stats_db_production_name}.software; +-DROP VIEW IF EXISTS ${stats_db_production_name}.software_citations; +-DROP VIEW IF EXISTS ${stats_db_production_name}.software_classifications; +-DROP VIEW IF EXISTS ${stats_db_production_name}.software_concepts; +-DROP VIEW IF EXISTS ${stats_db_production_name}.software_datasources; +-DROP VIEW IF EXISTS ${stats_db_production_name}.software_languages; +-DROP VIEW IF EXISTS ${stats_db_production_name}.software_licenses; +-DROP VIEW IF EXISTS ${stats_db_production_name}.software_oids; +-DROP VIEW IF EXISTS ${stats_db_production_name}.software_pids; +-DROP VIEW IF EXISTS ${stats_db_production_name}.software_refereed; +-DROP VIEW IF EXISTS ${stats_db_production_name}.software_sources; +-DROP VIEW IF EXISTS ${stats_db_production_name}.software_topics; +- +- +--- Creating the shadow database, in case it doesn't exist +-CREATE database IF NOT EXISTS ${stats_db_production_name}; +- +--- Creating new views +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.category AS SELECT * FROM ${stats_db_name}.category; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.concept AS SELECT * FROM ${stats_db_name}.concept; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.context AS SELECT * FROM ${stats_db_name}.context; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.country AS SELECT * FROM ${stats_db_name}.country; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.countrygdp AS SELECT * FROM ${stats_db_name}.countrygdp; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.creation_date AS SELECT * FROM ${stats_db_name}.creation_date; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset AS SELECT * FROM ${stats_db_name}.dataset; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_citations AS SELECT * FROM ${stats_db_name}.dataset_citations; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_classifications AS SELECT * FROM ${stats_db_name}.dataset_classifications; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_concepts AS SELECT * FROM ${stats_db_name}.dataset_concepts; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_datasources AS SELECT * FROM ${stats_db_name}.dataset_datasources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_languages AS SELECT * FROM ${stats_db_name}.dataset_languages; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_licenses AS SELECT * FROM ${stats_db_name}.dataset_licenses; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_oids AS SELECT * FROM ${stats_db_name}.dataset_oids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_pids AS SELECT * FROM ${stats_db_name}.dataset_pids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_refereed AS SELECT * FROM ${stats_db_name}.dataset_refereed; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_sources AS SELECT * FROM ${stats_db_name}.dataset_sources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_topics AS SELECT * FROM ${stats_db_name}.dataset_topics; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource AS SELECT * FROM ${stats_db_name}.datasource; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_languages AS SELECT * FROM ${stats_db_name}.datasource_languages; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_oids AS SELECT * FROM ${stats_db_name}.datasource_oids; +-CREATE VIEW IF NOT EXISTS 
${stats_db_production_name}.datasource_organizations AS SELECT * FROM ${stats_db_name}.datasource_organizations; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_results AS SELECT * FROM ${stats_db_name}.datasource_results; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_sources AS SELECT * FROM ${stats_db_name}.datasource_sources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.funder AS SELECT * FROM ${stats_db_name}.funder; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.fundref AS SELECT * FROM ${stats_db_name}.fundref; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.numbers_country AS SELECT * FROM ${stats_db_name}.numbers_country; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization AS SELECT * FROM ${stats_db_name}.organization; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_datasources AS SELECT * FROM ${stats_db_name}.organization_datasources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_pids AS SELECT * FROM ${stats_db_name}.organization_pids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_projects AS SELECT * FROM ${stats_db_name}.organization_projects; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_sources AS SELECT * FROM ${stats_db_name}.organization_sources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct AS SELECT * FROM ${stats_db_name}.otherresearchproduct; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_citations AS SELECT * FROM ${stats_db_name}.otherresearchproduct_citations; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_classifications AS SELECT * FROM ${stats_db_name}.otherresearchproduct_classifications; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_concepts AS SELECT * FROM ${stats_db_name}.otherresearchproduct_concepts; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_datasources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_datasources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_languages AS SELECT * FROM ${stats_db_name}.otherresearchproduct_languages; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_licenses AS SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_oids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_oids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_pids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_pids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_refereed AS SELECT * FROM ${stats_db_name}.otherresearchproduct_refereed; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_sources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_topics AS SELECT * FROM ${stats_db_name}.otherresearchproduct_topics; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project AS SELECT * FROM ${stats_db_name}.project; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_oids AS SELECT * FROM ${stats_db_name}.project_oids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_organizations AS SELECT * FROM ${stats_db_name}.project_organizations; +-CREATE VIEW IF NOT EXISTS 
${stats_db_production_name}.project_results AS SELECT * FROM ${stats_db_name}.project_results; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_resultcount AS SELECT * FROM ${stats_db_name}.project_resultcount; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_results_publication AS SELECT * FROM ${stats_db_name}.project_results_publication; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication AS SELECT * FROM ${stats_db_name}.publication; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_citations AS SELECT * FROM ${stats_db_name}.publication_citations; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_classifications AS SELECT * FROM ${stats_db_name}.publication_classifications; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_concepts AS SELECT * FROM ${stats_db_name}.publication_concepts; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_datasources AS SELECT * FROM ${stats_db_name}.publication_datasources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_languages AS SELECT * FROM ${stats_db_name}.publication_languages; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_licenses AS SELECT * FROM ${stats_db_name}.publication_licenses; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_oids AS SELECT * FROM ${stats_db_name}.publication_oids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_pids AS SELECT * FROM ${stats_db_name}.publication_pids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_refereed AS SELECT * FROM ${stats_db_name}.publication_refereed; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_sources AS SELECT * FROM ${stats_db_name}.publication_sources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_topics AS SELECT * FROM ${stats_db_name}.publication_topics; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result AS SELECT * FROM ${stats_db_name}.result; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_affiliated_country AS SELECT * FROM ${stats_db_name}.result_affiliated_country; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_citations AS SELECT * FROM ${stats_db_name}.result_citations; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_classifications AS SELECT * FROM ${stats_db_name}.result_classifications; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_concepts AS SELECT * FROM ${stats_db_name}.result_concepts; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_datasources AS SELECT * FROM ${stats_db_name}.result_datasources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_deposited_country AS SELECT * FROM ${stats_db_name}.result_deposited_country; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_fundercount AS SELECT * FROM ${stats_db_name}.result_fundercount; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_gold AS SELECT * FROM ${stats_db_name}.result_gold; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_greenoa AS SELECT * FROM ${stats_db_name}.result_greenoa; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_languages AS SELECT * FROM ${stats_db_name}.result_languages; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_licenses AS SELECT * FROM ${stats_db_name}.result_licenses; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_oids AS SELECT * FROM 
${stats_db_name}.result_oids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_organization AS SELECT * FROM ${stats_db_name}.result_organization; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_peerreviewed AS SELECT * FROM ${stats_db_name}.result_peerreviewed; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_pids AS SELECT * FROM ${stats_db_name}.result_pids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_projectcount AS SELECT * FROM ${stats_db_name}.result_projectcount; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_projects AS SELECT * FROM ${stats_db_name}.result_projects; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_refereed AS SELECT * FROM ${stats_db_name}.result_refereed; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_sources AS SELECT * FROM ${stats_db_name}.result_sources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_topics AS SELECT * FROM ${stats_db_name}.result_topics; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.rndexpediture AS SELECT * FROM ${stats_db_name}.rndexpediture; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.roarmap AS SELECT * FROM ${stats_db_name}.roarmap; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software AS SELECT * FROM ${stats_db_name}.software; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_citations AS SELECT * FROM ${stats_db_name}.software_citations; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_classifications AS SELECT * FROM ${stats_db_name}.software_classifications; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_concepts AS SELECT * FROM ${stats_db_name}.software_concepts; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_datasources AS SELECT * FROM ${stats_db_name}.software_datasources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_languages AS SELECT * FROM ${stats_db_name}.software_languages; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_licenses AS SELECT * FROM ${stats_db_name}.software_licenses; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_oids AS SELECT * FROM ${stats_db_name}.software_oids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_pids AS SELECT * FROM ${stats_db_name}.software_pids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_refereed AS SELECT * FROM ${stats_db_name}.software_refereed; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_sources AS SELECT * FROM ${stats_db_name}.software_sources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_topics AS SELECT * FROM ${stats_db_name}.software_topics; +-- +2.17.1 + + +From 0ba0a6b9dac25f5ec73e8eafefbf7f91442ad1c5 Mon Sep 17 00:00:00 2001 +From: antleb +Date: Fri, 12 Mar 2021 16:42:59 +0200 +Subject: [PATCH 8/8] update promote wf to support monitor&production + +--- + .../stats/oozie_app/updateProductionViews.sh | 14 +++---- + .../dhp/oa/graph/stats/oozie_app/workflow.xml | 37 ++++++++++++------- + 2 files changed, 29 insertions(+), 22 deletions(-) + +diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh +index 57acb2ee..3e510e87 100644 +--- 
a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh ++++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh +@@ -7,12 +7,10 @@ then + fi + + export SOURCE=$1 +-export SHADOW=$2 ++export PRODUCTION=$2 + +-echo "Updating shadow database" +-impala-shell -d ${SOURCE} -q "invalidate metadata" +-impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${SOURCE}.\1;/" | impala-shell -c -f - +-impala-shell -q "create database if not exists ${SHADOW}" +-impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -c -f - +-impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f - +-echo "Shadow db ready!" +\ No newline at end of file ++echo "Updating ${PRODUCTION} database" ++impala-shell -q "create database if not exists ${PRODUCTION}" ++impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f - ++impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f - ++echo "Production db ready!" +\ No newline at end of file +diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +index d744f18d..0d8ff7ee 100644 +--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml ++++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +@@ -6,7 +6,15 @@ + + + stats_db_production_name +- the name of the production schema ++ the name of the public production schema ++ ++ ++ monitor_db_name ++ the monitor database name ++ ++ ++ monitor_db_production_name ++ the name of the monitor public database + + + stats_tool_api_url +@@ -48,25 +56,26 @@ + + + +- +- ${hive_jdbc_url} +- +- stats_db_name=${stats_db_name} +- stats_db_production_name=${stats_db_production_name} +- +- ++ ++ ${jobTracker} ++ ${nameNode} ++ updateProductionViews.sh ++ ${stats_db_name} ++ ${stats_db_production_name} ++ updateProductionViews.sh ++ ++ + + + +- ++ + + ${jobTracker} + ${nameNode} +- impala-shell.sh +- ${stats_db_production_name} +- computeProductionStats.sql +- ${wf:appPath()}/scripts/computeProductionStats.sql +- impala-shell.sh ++ updateProductionViews.sh ++ ${monitor_db_name} ++ ${monitor_db_production_name} ++ updateProductionViews.sh + + + +-- +2.17.1 + diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 3f4f11243..84c52ce8e 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -6,7 +6,8 @@ eu.dnetlib.dhp dhp 1.2.4-SNAPSHOT - ../ + ../pom.xml + dhp-common @@ -29,12 +30,6 @@ spark-sql_2.11 - - eu.dnetlib.dhp - dhp-schemas - ${project.version} - - commons-cli commons-cli @@ -59,11 +54,6 @@ com.fasterxml.jackson.core jackson-databind - - - com.rabbitmq - amqp-client - net.sf.saxon Saxon-HE @@ -104,6 +94,21 @@ dnet-pace-core + + org.apache.httpcomponents + httpclient + + + + org.mongodb + mongo-java-driver + + + + eu.dnetlib.dhp + dhp-schemas + ${project.version} + diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java 
b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java index 68fc024af..59fe941ed 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java @@ -2,12 +2,16 @@ package eu.dnetlib.data.mdstore.manager.common.model; import java.io.Serializable; +import java.util.Date; +import java.util.Objects; import java.util.UUID; import javax.persistence.Column; import javax.persistence.Entity; import javax.persistence.Id; import javax.persistence.Table; +import javax.persistence.Temporal; +import javax.persistence.TemporalType; @Entity @Table(name = "mdstores") @@ -38,6 +42,13 @@ public class MDStore implements Serializable { @Column(name = "api_id") private String apiId; + @Column(name = "hdfs_path") + private String hdfsPath; + + @Column(name = "creation_date") + @Temporal(TemporalType.TIMESTAMP) + private Date creationDate; + public String getId() { return id; } @@ -94,9 +105,28 @@ public class MDStore implements Serializable { this.apiId = apiId; } + public String getHdfsPath() { + return hdfsPath; + } + + public void setHdfsPath(final String hdfsPath) { + this.hdfsPath = hdfsPath; + } + + public Date getCreationDate() { + return creationDate; + } + + public void setCreationDate(final Date creationDate) { + this.creationDate = creationDate; + } + public static MDStore newInstance( - final String format, final String layout, final String interpretation) { - return newInstance(format, layout, interpretation, null, null, null); + final String format, + final String layout, + final String interpretation, + final String hdfsBasePath) { + return newInstance(format, layout, interpretation, null, null, null, hdfsBasePath); } public static MDStore newInstance( @@ -105,15 +135,48 @@ public class MDStore implements Serializable { final String interpretation, final String dsName, final String dsId, - final String apiId) { + final String apiId, + final String hdfsBasePath) { + + final String mdId = "md-" + UUID.randomUUID(); + final MDStore md = new MDStore(); - md.setId("md-" + UUID.randomUUID()); + md.setId(mdId); md.setFormat(format); md.setLayout(layout); md.setInterpretation(interpretation); + md.setCreationDate(new Date()); md.setDatasourceName(dsName); md.setDatasourceId(dsId); md.setApiId(apiId); + md.setHdfsPath(String.format("%s/%s", hdfsBasePath, mdId)); + return md; } + + @Override + public String toString() { + return String + .format( + "MDStore [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, hdfsPath=%s, creationDate=%s]", + id, format, layout, interpretation, datasourceName, datasourceId, apiId, hdfsPath, creationDate); + } + + @Override + public int hashCode() { + return Objects.hash(id); + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (!(obj instanceof MDStore)) { + return false; + } + final MDStore other = (MDStore) obj; + return Objects.equals(id, other.id); + } + } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java index f74ab39be..d808e2de7 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java @@ -2,6 +2,7 @@ package 
eu.dnetlib.data.mdstore.manager.common.model; import java.io.Serializable; +import java.util.Objects; import javax.persistence.Column; import javax.persistence.Entity; @@ -48,4 +49,26 @@ public class MDStoreCurrentVersion implements Serializable { public static MDStoreCurrentVersion newInstance(final MDStoreVersion v) { return newInstance(v.getMdstore(), v.getId()); } + + @Override + public String toString() { + return String.format("MDStoreCurrentVersion [mdstore=%s, currentVersion=%s]", mdstore, currentVersion); + } + + @Override + public int hashCode() { + return Objects.hash(currentVersion, mdstore); + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (!(obj instanceof MDStoreCurrentVersion)) { + return false; + } + final MDStoreCurrentVersion other = (MDStoreCurrentVersion) obj; + return Objects.equals(currentVersion, other.currentVersion) && Objects.equals(mdstore, other.mdstore); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java index 7ef24f191..38f8f275e 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java @@ -3,6 +3,7 @@ package eu.dnetlib.data.mdstore.manager.common.model; import java.io.Serializable; import java.util.Date; +import java.util.Objects; import javax.persistence.Column; import javax.persistence.Entity; @@ -38,15 +39,22 @@ public class MDStoreVersion implements Serializable { @Column(name = "size") private long size = 0; - public static MDStoreVersion newInstance(final String mdId, final boolean writing) { - final MDStoreVersion t = new MDStoreVersion(); - t.setId(mdId + "-" + new Date().getTime()); - t.setMdstore(mdId); - t.setLastUpdate(null); - t.setWriting(writing); - t.setReadCount(0); - t.setSize(0); - return t; + @Column(name = "hdfs_path") + private String hdfsPath; + + public static MDStoreVersion newInstance(final String mdId, final boolean writing, final String hdfsBasePath) { + final MDStoreVersion v = new MDStoreVersion(); + + final String versionId = mdId + "-" + new Date().getTime(); + v.setId(versionId); + v.setMdstore(mdId); + v.setLastUpdate(null); + v.setWriting(writing); + v.setReadCount(0); + v.setSize(0); + v.setHdfsPath(String.format("%s/%s/%s", hdfsBasePath, mdId, versionId)); + + return v; } public String getId() { @@ -96,4 +104,37 @@ public class MDStoreVersion implements Serializable { public void setSize(final long size) { this.size = size; } + + public String getHdfsPath() { + return hdfsPath; + } + + public void setHdfsPath(final String hdfsPath) { + this.hdfsPath = hdfsPath; + } + + @Override + public String toString() { + return String + .format( + "MDStoreVersion [id=%s, mdstore=%s, writing=%s, readCount=%s, lastUpdate=%s, size=%s, hdfsPath=%s]", id, + mdstore, writing, readCount, lastUpdate, size, hdfsPath); + } + + @Override + public int hashCode() { + return Objects.hash(id); + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (!(obj instanceof MDStoreVersion)) { + return false; + } + final MDStoreVersion other = (MDStoreVersion) obj; + return Objects.equals(id, other.id); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java 
b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java index 438359241..510c65092 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java @@ -3,6 +3,7 @@ package eu.dnetlib.data.mdstore.manager.common.model; import java.io.Serializable; import java.util.Date; +import java.util.Objects; import javax.persistence.Column; import javax.persistence.Entity; @@ -43,6 +44,10 @@ public class MDStoreWithInfo implements Serializable { @Column(name = "current_version") private String currentVersion; + @Column(name = "creation_date") + @Temporal(TemporalType.TIMESTAMP) + private Date creationDate; + @Column(name = "lastupdate") @Temporal(TemporalType.TIMESTAMP) private Date lastUpdate; @@ -53,6 +58,9 @@ public class MDStoreWithInfo implements Serializable { @Column(name = "n_versions") private long numberOfVersions = 0; + @Column(name = "hdfs_path") + private String hdfsPath; + public String getId() { return id; } @@ -117,6 +125,14 @@ public class MDStoreWithInfo implements Serializable { this.currentVersion = currentVersion; } + public Date getCreationDate() { + return creationDate; + } + + public void setCreationDate(final Date creationDate) { + this.creationDate = creationDate; + } + public Date getLastUpdate() { return lastUpdate; } @@ -140,4 +156,39 @@ public class MDStoreWithInfo implements Serializable { public void setNumberOfVersions(final long numberOfVersions) { this.numberOfVersions = numberOfVersions; } + + public String getHdfsPath() { + return hdfsPath; + } + + public void setHdfsPath(final String hdfsPath) { + this.hdfsPath = hdfsPath; + } + + @Override + public String toString() { + return String + .format( + "MDStoreWithInfo [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, currentVersion=%s, creationDate=%s, lastUpdate=%s, size=%s, numberOfVersions=%s, hdfsPath=%s]", + id, format, layout, interpretation, datasourceName, datasourceId, apiId, currentVersion, creationDate, + lastUpdate, size, numberOfVersions, hdfsPath); + } + + @Override + public int hashCode() { + return Objects.hash(id); + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (!(obj instanceof MDStoreWithInfo)) { + return false; + } + final MDStoreWithInfo other = (MDStoreWithInfo) obj; + return Objects.equals(id, other.id); + } + } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ApplicationUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ApplicationUtils.java new file mode 100644 index 000000000..c53b83561 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ApplicationUtils.java @@ -0,0 +1,14 @@ + +package eu.dnetlib.dhp.application; + +import java.io.*; +import java.util.Map; +import java.util.Properties; + +import org.apache.hadoop.conf.Configuration; + +import com.google.common.collect.Maps; + +public class ApplicationUtils { + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java index e65b4bb0b..0429bc25d 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java @@ -1,10 +1,7 @@ package eu.dnetlib.dhp.application; -import 
java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.Serializable; -import java.io.StringWriter; +import java.io.*; import java.util.*; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; @@ -12,17 +9,21 @@ import java.util.zip.GZIPOutputStream; import org.apache.commons.cli.*; import org.apache.commons.codec.binary.Base64; import org.apache.commons.io.IOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; public class ArgumentApplicationParser implements Serializable { + private static final Logger log = LoggerFactory.getLogger(ArgumentApplicationParser.class); + private final Options options = new Options(); private final Map objectMap = new HashMap<>(); private final List compressedValues = new ArrayList<>(); - public ArgumentApplicationParser(final String json_configuration) throws Exception { + public ArgumentApplicationParser(final String json_configuration) throws IOException { final ObjectMapper mapper = new ObjectMapper(); final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class); createOptionMap(configuration); @@ -33,7 +34,6 @@ public class ArgumentApplicationParser implements Serializable { } private void createOptionMap(final OptionsParameter[] configuration) { - Arrays .stream(configuration) .map( @@ -47,10 +47,6 @@ public class ArgumentApplicationParser implements Serializable { return o; }) .forEach(options::addOption); - - // HelpFormatter formatter = new HelpFormatter(); - // formatter.printHelp("myapp", null, options, null, true); - } public static String decompressValue(final String abstractCompressed) { @@ -61,7 +57,7 @@ public class ArgumentApplicationParser implements Serializable { IOUtils.copy(gis, stringWriter); return stringWriter.toString(); } catch (Throwable e) { - System.out.println("Wrong value to decompress:" + abstractCompressed); + log.error("Wrong value to decompress:" + abstractCompressed); throw new RuntimeException(e); } } @@ -74,7 +70,7 @@ public class ArgumentApplicationParser implements Serializable { return java.util.Base64.getEncoder().encodeToString(out.toByteArray()); } - public void parseArgument(final String[] args) throws Exception { + public void parseArgument(final String[] args) throws ParseException { CommandLineParser parser = new BasicParser(); CommandLine cmd = parser.parse(options, args); Arrays diff --git a/dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java b/dhp-common/src/main/java/eu/dnetlib/dhp/collection/ApiDescriptor.java similarity index 94% rename from dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java rename to dhp-common/src/main/java/eu/dnetlib/dhp/collection/ApiDescriptor.java index bfd70e8c6..12937a197 100644 --- a/dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/collection/ApiDescriptor.java @@ -1,5 +1,5 @@ -package eu.dnetlib.collector.worker.model; +package eu.dnetlib.dhp.collection; import java.util.HashMap; import java.util.Map; diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java index 2b8ef4e30..eb4cb91ed 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java @@ -27,4 +27,24 @@ public class Constants { coarCodeLabelMap.put("c_f1cf", "EMBARGO"); } + 
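// Shared constants for the collection and transformation workflows: HDFS path suffixes, +
// workflow parameter names, HTTP client settings and report keys. +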
public static final String SEQUENCE_FILE_NAME = "/sequence_file"; + public static final String REPORT_FILE_NAME = "/report"; + public static final String MDSTORE_DATA_PATH = "/store"; + public static final String MDSTORE_SIZE_PATH = "/size"; + + public static final String COLLECTION_MODE = "collectionMode"; + public static final String METADATA_ENCODING = "metadataEncoding"; + public static final String OOZIE_WF_PATH = "oozieWfPath"; + public static final String DNET_MESSAGE_MGR_URL = "dnetMessageManagerURL"; + + public static final String MAX_NUMBER_OF_RETRY = "maxNumberOfRetry"; + public static final String REQUEST_DELAY = "requestDelay"; + public static final String RETRY_DELAY = "retryDelay"; + public static final String CONNECT_TIMEOUT = "connectTimeOut"; + public static final String READ_TIMEOUT = "readTimeOut"; + + public static final String CONTENT_TOTALITEMS = "TotalItems"; + public static final String CONTENT_INVALIDRECORDS = "InvalidRecords"; + public static final String CONTENT_TRANSFORMEDRECORDS = "transformedItems"; + } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MdstoreClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/MdstoreClient.java similarity index 79% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MdstoreClient.java rename to dhp-common/src/main/java/eu/dnetlib/dhp/common/MdstoreClient.java index a2177935a..0bc782ccb 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MdstoreClient.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/MdstoreClient.java @@ -1,39 +1,60 @@ -package eu.dnetlib.dhp.oa.graph.raw.common; +package eu.dnetlib.dhp.common; import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; +import java.util.Optional; import java.util.stream.StreamSupport; import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.bson.Document; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import com.google.common.collect.Iterables; +import com.mongodb.BasicDBObject; import com.mongodb.MongoClient; import com.mongodb.MongoClientURI; +import com.mongodb.QueryBuilder; import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoDatabase; public class MdstoreClient implements Closeable { + private static final Logger log = LoggerFactory.getLogger(MdstoreClient.class); + private final MongoClient client; private final MongoDatabase db; private static final String COLL_METADATA = "metadata"; private static final String COLL_METADATA_MANAGER = "metadataManager"; - private static final Log log = LogFactory.getLog(MdstoreClient.class); - public MdstoreClient(final String baseUrl, final String dbName) { this.client = new MongoClient(new MongoClientURI(baseUrl)); this.db = getDb(client, dbName); } + public MongoCollection mdStore(final String mdId) { + BasicDBObject query = (BasicDBObject) QueryBuilder.start("mdId").is(mdId).get(); + + log.info("querying current mdId: {}", query.toJson()); + + final String currentId = Optional + .ofNullable(getColl(db, COLL_METADATA_MANAGER, true).find(query)) + .map(r -> r.first()) + .map(d -> d.getString("currentId")) + .orElseThrow(() -> new IllegalArgumentException("cannot find current mdstore id for: " + mdId)); + + log.info("currentId: {}", currentId); + + return getColl(db, currentId, true); + } + public Map 
validCollections( final String mdFormat, final String mdLayout, final String mdInterpretation) { diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java new file mode 100644 index 000000000..853d22bc2 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java @@ -0,0 +1,72 @@ + +package eu.dnetlib.dhp.common.rest; + +import java.util.Arrays; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpPost; +import org.apache.http.client.methods.HttpUriRequest; +import org.apache.http.entity.StringEntity; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +public class DNetRestClient { + + private static final Logger log = LoggerFactory.getLogger(DNetRestClient.class); + + private static ObjectMapper mapper = new ObjectMapper(); + + public static T doGET(final String url, Class clazz) throws Exception { + final HttpGet httpGet = new HttpGet(url); + return doHTTPRequest(httpGet, clazz); + } + + public static String doGET(final String url) throws Exception { + final HttpGet httpGet = new HttpGet(url); + return doHTTPRequest(httpGet); + } + + public static String doPOST(final String url, V objParam) throws Exception { + final HttpPost httpPost = new HttpPost(url); + + if (objParam != null) { + final StringEntity entity = new StringEntity(mapper.writeValueAsString(objParam)); + httpPost.setEntity(entity); + httpPost.setHeader("Accept", "application/json"); + httpPost.setHeader("Content-type", "application/json"); + } + return doHTTPRequest(httpPost); + } + + public static T doPOST(final String url, V objParam, Class clazz) throws Exception { + return mapper.readValue(doPOST(url, objParam), clazz); + } + + private static String doHTTPRequest(final HttpUriRequest r) throws Exception { + CloseableHttpClient client = HttpClients.createDefault(); + + log.info("performing HTTP request, method {} on URI {}", r.getMethod(), r.getURI().toString()); + log + .info( + "request headers: {}", + Arrays + .asList(r.getAllHeaders()) + .stream() + .map(h -> h.getName() + ":" + h.getValue()) + .collect(Collectors.joining(","))); + + CloseableHttpResponse response = client.execute(r); + return IOUtils.toString(response.getEntity().getContent()); + } + + private static T doHTTPRequest(final HttpUriRequest r, Class clazz) throws Exception { + return mapper.readValue(doHTTPRequest(r), clazz); + } +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/Vocabulary.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java similarity index 98% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/Vocabulary.java rename to dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java index bfc4fd6f1..1e333e93f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/Vocabulary.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.oa.graph.raw.common; +package eu.dnetlib.dhp.common.vocabulary; import java.io.Serializable; import 
java.util.HashMap; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java similarity index 84% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java rename to dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java index 32452bdc5..12c6279e5 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.oa.graph.raw.common; +package eu.dnetlib.dhp.common.vocabulary; import java.io.Serializable; import java.util.*; @@ -67,6 +67,10 @@ public class VocabularyGroup implements Serializable { private final Map vocs = new HashMap<>(); + public Set vocabularyNames() { + return vocs.keySet(); + } + public void addVocabulary(final String id, final String name) { vocs.put(id.toLowerCase(), new Vocabulary(id, name)); } @@ -118,7 +122,31 @@ public class VocabularyGroup implements Serializable { return vocs.get(vocId.toLowerCase()).getSynonymAsQualifier(syn); } + /** + * getSynonymAsQualifierCaseSensitive + * + * reflects the need to look up a case-sensitive vocabulary + */ + public Qualifier getSynonymAsQualifierCaseSensitive(final String vocId, final String syn) { + if (StringUtils.isBlank(vocId)) { + return OafMapperUtils.unknown("", ""); + } + return vocs.get(vocId).getSynonymAsQualifier(syn); + } + + /** + * termExists + * + * overloaded: without and with a case-sensitive check + */ public boolean termExists(final String vocId, final String id) { + return termExists(vocId, id, Boolean.FALSE); + } + + public boolean termExists(final String vocId, final String id, final Boolean caseSensitive) { + if (Boolean.TRUE.equals(caseSensitive)) { + return vocabularyExists(vocId) && vocs.get(vocId).termExists(id); + } return vocabularyExists(vocId) && vocs.get(vocId.toLowerCase()).termExists(id); }
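The overloads above keep the old case-insensitive behaviour as the default: the two-argument `termExists` delegates to the three-argument form with `Boolean.FALSE`. A minimal usage sketch follows; the vocabulary id and term are illustrative, and the `VocabularyGroup` instance is assumed to be already populated:

```java
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;

public class VocabularyLookupExample {

	// sketch only: "dnet:pid_types" and "doi" are illustrative values, not taken from this patch
	public static boolean hasDoi(final VocabularyGroup vocabularies) {
		// default lookup: the vocabulary id is lowercased before the lookup
		final boolean relaxed = vocabularies.termExists("dnet:pid_types", "doi");
		// new overload: the vocabulary id is matched exactly as given
		final boolean strict = vocabularies.termExists("dnet:pid_types", "doi", Boolean.TRUE);
		return relaxed && strict;
	}
}
```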
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyTerm.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyTerm.java similarity index 88% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyTerm.java rename to dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyTerm.java index 1aa1b8253..52eb7ca23 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyTerm.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyTerm.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.oa.graph.raw.common; +package eu.dnetlib.dhp.common.vocabulary; import java.io.Serializable; diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java b/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java new file mode 100644 index 000000000..f1107b4b8 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java @@ -0,0 +1,64 @@ + +package eu.dnetlib.dhp.message; + +import java.io.Serializable; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; + +public class Message implements Serializable { + + private static final long serialVersionUID = 401753881204524893L; + + public static String CURRENT_PARAM = "current"; + public static String TOTAL_PARAM = "total"; + + private MessageType messageType; + + private String workflowId; + + private Map body; + + public Message() { + } + + public Message(final MessageType messageType, final String workflowId) { + this(messageType, workflowId, new LinkedHashMap<>()); + } + + public Message(final MessageType messageType, final String workflowId, final Map body) { + this.messageType = messageType; + this.workflowId = workflowId; + this.body = body; + } + + public MessageType getMessageType() { + return messageType; + } + + public void setMessageType(MessageType messageType) { + this.messageType = messageType; + } + + public String getWorkflowId() { + return workflowId; + } + + public void setWorkflowId(final String workflowId) { + this.workflowId = workflowId; + } + + public Map getBody() { + return body; + } + + public void setBody(final Map body) { + this.body = body; + } + + @Override + public String toString() { + return String.format("Message [type=%s, workflowId=%s, body=%s]", messageType, workflowId, body); + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java new file mode 100644 index 000000000..0c6eacf99 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java @@ -0,0 +1,94 @@ + +package eu.dnetlib.dhp.message; + +import java.util.Map; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +import org.apache.http.client.config.RequestConfig; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpPut; +import org.apache.http.entity.ContentType; +import org.apache.http.entity.StringEntity; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; + +public class MessageSender { + + private static final Logger log = LoggerFactory.getLogger(MessageSender.class); + + private static final int SOCKET_TIMEOUT_MS = 2000; + + private static final int CONNECTION_REQUEST_TIMEOUT_MS = 2000; + + private static final int CONNTECTION_TIMEOUT_MS = 2000; + + private final ObjectMapper objectMapper = new ObjectMapper(); + + private final String dnetMessageEndpoint; + + private final String workflowId; + + private ExecutorService executorService = Executors.newCachedThreadPool(); + + public MessageSender(final String dnetMessageEndpoint, final String workflowId) { + this.workflowId = workflowId; + this.dnetMessageEndpoint = dnetMessageEndpoint; + } + + public void sendMessage(final Message message) { + executorService.submit(() -> _sendMessage(message)); + } + + public void sendMessage(final Long current, final Long total) { + sendMessage(createOngoingMessage(current, total)); + } + + public void sendReport(final Map report) { + sendMessage(new Message(MessageType.REPORT, workflowId, report)); + } + + private Message createOngoingMessage(final Long current, final Long total) { + final Message m = new Message(MessageType.ONGOING, workflowId); + m.getBody().put(Message.CURRENT_PARAM, current.toString()); + if (total != null) { + m.getBody().put(Message.TOTAL_PARAM, total.toString()); + } + return m; + } + + private void _sendMessage(final Message message) { + try { + final String json = objectMapper.writeValueAsString(message); + + final HttpPut req = new HttpPut(dnetMessageEndpoint); + req.setEntity(new
StringEntity(json, ContentType.APPLICATION_JSON)); + + final RequestConfig requestConfig = RequestConfig + .custom() + .setConnectTimeout(CONNTECTION_TIMEOUT_MS) + .setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT_MS) + .setSocketTimeout(SOCKET_TIMEOUT_MS) + .build(); + + try (final CloseableHttpClient client = HttpClients + .custom() + .setDefaultRequestConfig(requestConfig) + .build(); + final CloseableHttpResponse response = client.execute(req)) { + log.debug("Sent Message to " + dnetMessageEndpoint); + log.debug("MESSAGE:" + message); + } catch (final Throwable e) { + log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e); + } + } catch (final JsonProcessingException e) { + log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e); + } + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageType.java b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageType.java new file mode 100644 index 000000000..75ffb8ef5 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageType.java @@ -0,0 +1,21 @@ + +package eu.dnetlib.dhp.message; + +import java.io.Serializable; +import java.util.Optional; + +import org.apache.commons.lang3.StringUtils; + +public enum MessageType implements Serializable { + + ONGOING, REPORT; + + public MessageType from(String value) { + return Optional + .ofNullable(value) + .map(StringUtils::upperCase) + .map(MessageType::valueOf) + .orElseThrow(() -> new IllegalArgumentException("unknown message type: " + value)); + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java index 1cee3058e..401d5d444 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java @@ -149,17 +149,26 @@ public class CleaningFunctions { if (Objects.nonNull(r.getInstance())) { for (Instance i : r.getInstance()) { - final Set pids = Sets.newHashSet(i.getPid()); - i - .setAlternateIdentifier( - Optional - .ofNullable(i.getAlternateIdentifier()) - .map( - altId -> altId + Optional + .ofNullable(i.getPid()) + .ifPresent(pid -> { + final Set pids = + pid .stream() - .filter(p -> !pids.contains(p)) - .collect(Collectors.toList())) - .orElse(Lists.newArrayList())); + .filter(Objects::nonNull) + .filter(p -> StringUtils.isNotBlank(p.getValue())) + .collect(Collectors.toCollection(HashSet::new)); + + Optional.ofNullable(i.getAlternateIdentifier()) + .ifPresent(altId -> { + final Set altIds = altId.stream() + .filter(Objects::nonNull) + .filter(p -> StringUtils.isNotBlank(p.getValue())) + .collect(Collectors.toCollection(HashSet::new)); + + i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids))); + }); + }); if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) { i diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java index 8872174a5..8d760a2cd 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java @@ -1,18 +1,29 @@ package eu.dnetlib.dhp.utils; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; +import java.io.*; import java.nio.charset.StandardCharsets; import java.security.MessageDigest; import java.util.List; 
+import java.util.Map; +import java.util.Properties; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; import org.apache.commons.codec.binary.Base64; import org.apache.commons.codec.binary.Base64OutputStream; import org.apache.commons.codec.binary.Hex; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.SaveMode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Maps; import com.jayway.jsonpath.JsonPath; import net.minidev.json.JSONArray; @@ -21,6 +32,8 @@ import scala.collection.Seq; public class DHPUtils { + private static final Logger log = LoggerFactory.getLogger(DHPUtils.class); + public static Seq toSeq(List list) { return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq(); } @@ -79,4 +92,72 @@ public class DHPUtils { return ""; } } + + public static final ObjectMapper MAPPER = new ObjectMapper(); + + public static void writeHdfsFile(final Configuration conf, final String content, final String path) + throws IOException { + + log.info("writing file {}, size {}", path, content.length()); + try (FileSystem fs = FileSystem.get(conf); + BufferedOutputStream os = new BufferedOutputStream(fs.create(new Path(path)))) { + os.write(content.getBytes(StandardCharsets.UTF_8)); + os.flush(); + } + } + + public static String readHdfsFile(Configuration conf, String path) throws IOException { + log.info("reading file {}", path); + + try (FileSystem fs = FileSystem.get(conf)) { + final Path p = new Path(path); + if (!fs.exists(p)) { + throw new FileNotFoundException(path); + } + return IOUtils.toString(fs.open(p)); + } + } + + public static T readHdfsFileAs(Configuration conf, String path, Class clazz) throws IOException { + return MAPPER.readValue(readHdfsFile(conf, path), clazz); + } + + public static void saveDataset(final Dataset mdstore, final String targetPath) { + log.info("saving dataset in: {}", targetPath); + mdstore + .write() + .mode(SaveMode.Overwrite) + .format("parquet") + .save(targetPath); + } + + public static Configuration getHadoopConfiguration(String nameNode) { + // ====== Init HDFS File System Object + Configuration conf = new Configuration(); + // Set FileSystem URI + conf.set("fs.defaultFS", nameNode); + // bind the FileSystem implementations explicitly: the service entries can be lost when packaging an uber-jar with Maven + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + + System.setProperty("hadoop.home.dir", "/"); + return conf; + } + + public static void populateOOZIEEnv(final Map report) throws IOException { + File file = new File(System.getProperty("oozie.action.output.properties")); + Properties props = new Properties(); + report.forEach((k, v) -> props.setProperty(k, v)); + + try (OutputStream os = new FileOutputStream(file)) { + props.store(os, ""); + } + } + + public static void populateOOZIEEnv(final String paramName, String value) throws IOException { + Map report = Maps.newHashMap(); + report.put(paramName, value); + + populateOOZIEEnv(report); + } }
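The HDFS helpers introduced above are meant to be called from workflow nodes for small control files (sizes, reports, oozie properties). A minimal usage sketch, where the namenode URL and target path are placeholders:

```java
import org.apache.hadoop.conf.Configuration;

import eu.dnetlib.dhp.utils.DHPUtils;

public class HdfsHelpersExample {

	public static void main(final String[] args) throws Exception {
		// placeholder namenode URL and target path
		final Configuration conf = DHPUtils.getHadoopConfiguration("hdfs://namenode.example.org:8020");

		// write a small JSON control file, then read it back
		DHPUtils.writeHdfsFile(conf, "{\"mdStoreSize\": 42}", "/tmp/mdstore-demo/size.json");
		System.out.println(DHPUtils.readHdfsFile(conf, "/tmp/mdstore-demo/size.json"));
	}
}
```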
diff --git a/dhp-common/src/main/java/eu/dnetlib/message/Message.java b/dhp-common/src/main/java/eu/dnetlib/message/Message.java deleted file mode 100644 index fc1c38291..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/message/Message.java +++ /dev/null @@ -1,76 +0,0 @@ - -package eu.dnetlib.message; - -import java.io.IOException; -import java.util.Map; - -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; - -public class Message { - - private String workflowId; - - private String jobName; - - private MessageType type; - - private Map body; - - public static Message fromJson(final String json) throws IOException { - final ObjectMapper jsonMapper = new ObjectMapper(); - return jsonMapper.readValue(json, Message.class); - } - - public Message() { - } - - public Message(String workflowId, String jobName, MessageType type, Map body) { - this.workflowId = workflowId; - this.jobName = jobName; - this.type = type; - this.body = body; - } - - public String getWorkflowId() { - return workflowId; - } - - public void setWorkflowId(String workflowId) { - this.workflowId = workflowId; - } - - public String getJobName() { - return jobName; - } - - public void setJobName(String jobName) { - this.jobName = jobName; - } - - public MessageType getType() { - return type; - } - - public void setType(MessageType type) { - this.type = type; - } - - public Map getBody() { - return body; - } - - public void setBody(Map body) { - this.body = body; - } - - @Override - public String toString() { - final ObjectMapper jsonMapper = new ObjectMapper(); - try { - return jsonMapper.writeValueAsString(this); - } catch (JsonProcessingException e) { - return null; - } - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java b/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java deleted file mode 100644 index fb3f0bd95..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java +++ /dev/null @@ -1,47 +0,0 @@ - -package eu.dnetlib.message; - -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.util.concurrent.LinkedBlockingQueue; - -import com.rabbitmq.client.AMQP; -import com.rabbitmq.client.Channel; -import com.rabbitmq.client.DefaultConsumer; -import com.rabbitmq.client.Envelope; - -public class MessageConsumer extends DefaultConsumer { - - final LinkedBlockingQueue queueMessages; - - /** - * Constructs a new instance and records its association to the passed-in channel.
- * - * @param channel the channel to which this consumer is attached - * @param queueMessages - */ - public MessageConsumer(Channel channel, LinkedBlockingQueue queueMessages) { - super(channel); - this.queueMessages = queueMessages; - } - - @Override - public void handleDelivery( - String consumerTag, Envelope envelope, AMQP.BasicProperties properties, byte[] body) - throws IOException { - final String json = new String(body, StandardCharsets.UTF_8); - Message message = Message.fromJson(json); - try { - this.queueMessages.put(message); - System.out.println("Receiving Message " + message); - } catch (InterruptedException e) { - if (message.getType() == MessageType.REPORT) - throw new RuntimeException("Error on sending message"); - else { - // TODO LOGGING EXCEPTION - } - } finally { - getChannel().basicAck(envelope.getDeliveryTag(), false); - } - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java b/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java deleted file mode 100644 index 5ca79f3cc..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java +++ /dev/null @@ -1,136 +0,0 @@ - -package eu.dnetlib.message; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.TimeoutException; - -import com.rabbitmq.client.Channel; -import com.rabbitmq.client.Connection; -import com.rabbitmq.client.ConnectionFactory; - -public class MessageManager { - - private final String messageHost; - - private final String username; - - private final String password; - - private Connection connection; - - private final Map channels = new HashMap<>(); - - private boolean durable; - - private boolean autodelete; - - private final LinkedBlockingQueue queueMessages; - - public MessageManager( - String messageHost, - String username, - String password, - final LinkedBlockingQueue queueMessages) { - this.queueMessages = queueMessages; - this.messageHost = messageHost; - this.username = username; - this.password = password; - } - - public MessageManager( - String messageHost, - String username, - String password, - boolean durable, - boolean autodelete, - final LinkedBlockingQueue queueMessages) { - this.queueMessages = queueMessages; - this.messageHost = messageHost; - this.username = username; - this.password = password; - - this.durable = durable; - this.autodelete = autodelete; - } - - private Connection createConnection() throws IOException, TimeoutException { - ConnectionFactory factory = new ConnectionFactory(); - factory.setHost(this.messageHost); - factory.setUsername(this.username); - factory.setPassword(this.password); - return factory.newConnection(); - } - - private Channel createChannel( - final Connection connection, - final String queueName, - final boolean durable, - final boolean autodelete) - throws Exception { - Map args = new HashMap<>(); - args.put("x-message-ttl", 10000); - Channel channel = connection.createChannel(); - channel.queueDeclare(queueName, durable, false, this.autodelete, args); - return channel; - } - - private Channel getOrCreateChannel(final String queueName, boolean durable, boolean autodelete) - throws Exception { - if (channels.containsKey(queueName)) { - return channels.get(queueName); - } - - if (this.connection == null) { - this.connection = createConnection(); - } - channels.put(queueName, createChannel(this.connection, queueName, durable, autodelete)); - return channels.get(queueName); - } - - public void 
close() throws IOException { - channels - .values() - .forEach( - ch -> { - try { - ch.close(); - } catch (Exception e) { - // TODO LOG - } - }); - - this.connection.close(); - } - - public boolean sendMessage(final Message message, String queueName) throws Exception { - try { - Channel channel = getOrCreateChannel(queueName, this.durable, this.autodelete); - channel.basicPublish("", queueName, null, message.toString().getBytes()); - return true; - } catch (Throwable e) { - throw new RuntimeException(e); - } - } - - public boolean sendMessage( - final Message message, String queueName, boolean durable_var, boolean autodelete_var) - throws Exception { - try { - Channel channel = getOrCreateChannel(queueName, durable_var, autodelete_var); - channel.basicPublish("", queueName, null, message.toString().getBytes()); - return true; - } catch (Throwable e) { - throw new RuntimeException(e); - } - } - - public void startConsumingMessage( - final String queueName, final boolean durable, final boolean autodelete) throws Exception { - - Channel channel = createChannel(createConnection(), queueName, durable, autodelete); - channel.basicConsume(queueName, false, new MessageConsumer(channel, queueMessages)); - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java b/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java deleted file mode 100644 index 72cbda252..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java +++ /dev/null @@ -1,6 +0,0 @@ - -package eu.dnetlib.message; - -public enum MessageType { - ONGOING, REPORT -} diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java deleted file mode 100644 index cb4d0ab50..000000000 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java +++ /dev/null @@ -1,16 +0,0 @@ - -package eu.dnetlib.dhp.model.mdstore; - -import static org.junit.jupiter.api.Assertions.assertTrue; - -import org.junit.jupiter.api.Test; - -public class MetadataRecordTest { - - @Test - public void getTimestamp() { - - MetadataRecord r = new MetadataRecord(); - assertTrue(r.getDateOfCollection() > 0); - } -} diff --git a/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java b/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java deleted file mode 100644 index 442f7b5c2..000000000 --- a/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java +++ /dev/null @@ -1,51 +0,0 @@ - -package eu.dnetlib.message; - -import static org.junit.jupiter.api.Assertions.*; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; - -import org.junit.jupiter.api.Test; - -public class MessageTest { - - @Test - public void fromJsonTest() throws IOException { - Message m = new Message(); - m.setWorkflowId("wId"); - m.setType(MessageType.ONGOING); - m.setJobName("Collection"); - Map body = new HashMap<>(); - body.put("parsedItem", "300"); - body.put("ExecutionTime", "30s"); - - m.setBody(body); - System.out.println("m = " + m); - Message m1 = Message.fromJson(m.toString()); - assertEquals(m1.getWorkflowId(), m.getWorkflowId()); - assertEquals(m1.getType(), m.getType()); - assertEquals(m1.getJobName(), m.getJobName()); - - assertNotNull(m1.getBody()); - m1.getBody().keySet().forEach(it -> assertEquals(m1.getBody().get(it), m.getBody().get(it))); - assertEquals(m1.getJobName(), m.getJobName()); - } - - @Test - public void toStringTest() { - final String expectedJson = 
"{\"workflowId\":\"wId\",\"jobName\":\"Collection\",\"type\":\"ONGOING\",\"body\":{\"ExecutionTime\":\"30s\",\"parsedItem\":\"300\"}}"; - Message m = new Message(); - m.setWorkflowId("wId"); - m.setType(MessageType.ONGOING); - m.setJobName("Collection"); - Map body = new HashMap<>(); - body.put("parsedItem", "300"); - body.put("ExecutionTime", "30s"); - - m.setBody(body); - - assertEquals(expectedJson, m.toString()); - } -} diff --git a/dhp-schemas/pom.xml b/dhp-schemas/pom.xml index 73efeabb4..8e2a169f2 100644 --- a/dhp-schemas/pom.xml +++ b/dhp-schemas/pom.xml @@ -67,6 +67,11 @@ guava + + commons-codec + commons-codec + + diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java index d58576019..85f24bc0b 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java @@ -3,17 +3,19 @@ package eu.dnetlib.dhp.schema.common; import static com.google.common.base.Preconditions.checkArgument; +import java.nio.charset.StandardCharsets; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; import java.text.ParseException; -import java.text.SimpleDateFormat; import java.time.Instant; import java.time.format.DateTimeFormatter; -import java.time.temporal.TemporalAccessor; import java.util.Date; import java.util.Map; import java.util.Objects; import java.util.Optional; import java.util.function.Function; +import org.apache.commons.codec.binary.Hex; import org.apache.commons.lang3.StringUtils; import com.google.common.collect.Maps; @@ -252,13 +254,6 @@ public class ModelSupport { .setRelation("isRelatedTo") .setRelType("resultResult") .setSubReltype("relationship")); - relationInverseMap - .put( - "resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse() - .setInverse("hasAmongTopNSimilarDocuments") - .setRelation("isAmongTopNSimilarDocuments") - .setRelType("resultResult") - .setSubReltype("similarity")); relationInverseMap .put( "resultResult_supplement_isSupplementTo", new RelationInverse() @@ -482,6 +477,20 @@ public class ModelSupport { return ((OafEntity) t).getId(); } + public static String md5(final String s) { + try { + final MessageDigest md = MessageDigest.getInstance("MD5"); + md.update(s.getBytes(StandardCharsets.UTF_8)); + return new String(Hex.encodeHex(md.digest())); + } catch (final NoSuchAlgorithmException e) { + throw new IllegalStateException(e); + } + } + + public static String generateIdentifier(final String originalId, final String nsPrefix) { + return String.format("%s::%s", nsPrefix, md5(originalId)); + } + public static String oldest(String dateA, String dateB) throws ParseException { if (StringUtils.isBlank(dateA)) { diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/mdstore/MetadataRecord.java similarity index 76% rename from dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java rename to dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/mdstore/MetadataRecord.java index ce65e710f..8277e1469 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/mdstore/MetadataRecord.java @@ -1,11 +1,13 @@ -package eu.dnetlib.dhp.model.mdstore; +package eu.dnetlib.dhp.schema.mdstore; import java.io.Serializable; -import eu.dnetlib.dhp.utils.DHPUtils; 
+import eu.dnetlib.dhp.schema.common.ModelSupport; -/** This class models a record inside the new Metadata store collection on HDFS * */ +/** + * This class models a record in a Metadata store collection on HDFS + */ public class MetadataRecord implements Serializable { /** The D-Net Identifier associated to the record */ @@ -26,13 +28,13 @@ public class MetadataRecord implements Serializable { private String body; /** the date when the record has been stored */ - private long dateOfCollection; + private Long dateOfCollection; /** the date when the record has been stored */ - private long dateOfTransformation; + private Long dateOfTransformation; public MetadataRecord() { - this.dateOfCollection = System.currentTimeMillis(); + } public MetadataRecord( @@ -40,14 +42,14 @@ public class MetadataRecord implements Serializable { String encoding, Provenance provenance, String body, - long dateOfCollection) { + Long dateOfCollection) { this.originalId = originalId; this.encoding = encoding; this.provenance = provenance; this.body = body; this.dateOfCollection = dateOfCollection; - this.id = DHPUtils.generateIdentifier(originalId, this.provenance.getNsPrefix()); + this.id = ModelSupport.generateIdentifier(originalId, this.provenance.getNsPrefix()); } public String getId() { @@ -90,19 +92,19 @@ public class MetadataRecord implements Serializable { this.body = body; } - public long getDateOfCollection() { + public Long getDateOfCollection() { return dateOfCollection; } - public void setDateOfCollection(long dateOfCollection) { + public void setDateOfCollection(Long dateOfCollection) { this.dateOfCollection = dateOfCollection; } - public long getDateOfTransformation() { + public Long getDateOfTransformation() { return dateOfTransformation; } - public void setDateOfTransformation(long dateOfTransformation) { + public void setDateOfTransformation(Long dateOfTransformation) { this.dateOfTransformation = dateOfTransformation; } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/mdstore/Provenance.java similarity index 96% rename from dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java rename to dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/mdstore/Provenance.java index 556535022..8af58f628 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/mdstore/Provenance.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.model.mdstore; +package eu.dnetlib.dhp.schema.mdstore; import java.io.Serializable; diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala index 371efc3a7..defcb6a28 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala @@ -15,11 +15,11 @@ object OafUtils { } - def generateDataInfo(trust: String = "0.9", invisibile: Boolean = false): DataInfo = { + def generateDataInfo(trust: String = "0.9", invisible: Boolean = false): DataInfo = { val di = new DataInfo di.setDeletedbyinference(false) di.setInferred(false) - di.setInvisible(false) + di.setInvisible(invisible) di.setTrust(trust) di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions")) di diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java 
b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java index f5b9bf028..98baa1efc 100644 --- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java +++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.schema.oaf; import static org.junit.jupiter.api.Assertions.*; +import java.time.format.DateTimeParseException; import java.util.Arrays; import java.util.List; @@ -94,7 +95,7 @@ public class MergeTest { @Test public void mergeRelationTestParseException() { - assertThrows(IllegalArgumentException.class, () -> { + assertThrows(DateTimeParseException.class, () -> { Relation a = createRel(true, "2016-04-05"); Relation b = createRel(true, "2016-04-05"); a.mergeFrom(b); diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java index bab4377bd..0052026d4 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java @@ -5,12 +5,12 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass; import java.io.IOException; -import java.util.Objects; import java.util.Optional; import java.util.function.BiFunction; import java.util.function.Function; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; @@ -194,7 +194,7 @@ public class PromoteActionPayloadForGraphTableJob { SerializableSupplier> mergeRowWithActionPayloadAndGetFn = MergeAndGet.functionFor(strategy); SerializableSupplier> mergeRowsAndGetFn = MergeAndGet.functionFor(strategy); SerializableSupplier zeroFn = zeroFn(rowClazz); - SerializableSupplier> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSource; + SerializableSupplier> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSourceAndTarget; Dataset joinedAndMerged = PromoteActionPayloadFunctions .joinGraphTableWithActionPayloadAndMerge( @@ -238,12 +238,13 @@ public class PromoteActionPayloadForGraphTableJob { } } - private static Function isNotZeroFnUsingIdOrSource() { + private static Function isNotZeroFnUsingIdOrSourceAndTarget() { return t -> { if (isSubClass(t, Relation.class)) { - return Objects.nonNull(((Relation) t).getSource()); + final Relation rel = (Relation) t; + return StringUtils.isNotBlank(rel.getSource()) && StringUtils.isNotBlank(rel.getTarget()); } - return Objects.nonNull(((OafEntity) t).getId()); + return StringUtils.isNotBlank(((OafEntity) t).getId()); }; } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java index 56c8dd05a..d799c646b 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java @@ -112,6 +112,7 
@@ public class PromoteActionPayloadFunctions { Class rowClazz) { TypedColumn aggregator = new TableAggregator<>(zeroFn, mergeAndGetFn, isNotZeroFn, rowClazz).toColumn(); return rowDS + .filter((FilterFunction) o -> isNotZeroFn.get().apply(o)) .groupByKey((MapFunction) x -> rowIdFn.get().apply(x), Encoders.STRING()) .agg(aggregator) .map((MapFunction, G>) Tuple2::_2, Encoders.kryo(rowClazz)); diff --git a/dhp-workflows/dhp-aggregation/README.md b/dhp-workflows/dhp-aggregation/README.md index 02583b443..5ed6a82d7 100644 --- a/dhp-workflows/dhp-aggregation/README.md +++ b/dhp-workflows/dhp-aggregation/README.md @@ -1,29 +1,27 @@ Description of the Module -------------------------- -This module defines a **collector worker application** that runs on Hadoop. +This module defines a set of Oozie workflows for the **collection** and **transformation** of metadata records. +Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure +the consistency of the read/write operations on the data, since the MdSM keeps track of the logical-physical mapping +of each MDStore. -It is responsible for harvesting metadata using different plugins. +## Metadata collection -The collector worker uses a message queue to inform the progress -of the harvesting action (using a message queue for sending **ONGOING** messages) furthermore, -It gives, at the end of the job, some information about the status -of the collection i.e Number of records collected(using a message queue for sending **REPORT** messages). +The **metadata collection workflow** is responsible for harvesting metadata records exposed via different protocols and +in different formats, and for storing them on HDFS so that they can be further processed. -To work the collection worker need some parameter like: +### Collector Plugins -* **hdfsPath**: the path where storing the sequential file -* **apidescriptor**: the JSON encoding of the API Descriptor -* **namenode**: the Name Node URI -* **userHDFS**: the user wich create the hdfs seq file -* **rabbitUser**: the user to connect with RabbitMq for messaging -* **rabbitPassWord**: the password to connect with RabbitMq for messaging -* **rabbitHost**: the host of the RabbitMq server -* **rabbitOngoingQueue**: the name of the ongoing queue -* **rabbitReportQueue**: the name of the report queue -* **workflowId**: the identifier of the dnet Workflow +Different protocols are managed by dedicated Collector plugins, i.e.
Java programs implementing a defined interface: -##Plugins -* OAI Plugin +```eu.dnetlib.dhp.collection.plugin.CollectorPlugin``` + +The list of the supported plugins: + +* OAI Plugin: collects from OAI-PMH compatible endpoints +* MDStore plugin: collects from a given D-Net MetadataStore (identified by a mongodb URI, dbName, MDStoreID) +* MDStore dump plugin: collects from an MDStore dump stored on the HDFS location indicated by the `path` parameter + +# Transformation Plugins +TODO -## Usage -TODO \ No newline at end of file
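As a sketch of the plugin contract named above: a collector plugin essentially turns an `ApiDescriptor` into a stream of metadata records. The snippet below is only an illustration; the `collect(ApiDescriptor)` signature is an assumption for this sketch, not the verbatim `CollectorPlugin` interface.

```java
package eu.dnetlib.dhp.collection.plugin;

import java.util.stream.Stream;

import eu.dnetlib.dhp.collection.ApiDescriptor;

// Hypothetical minimal plugin: the collect(ApiDescriptor) method shown here is
// assumed for illustration; check the CollectorPlugin interface for the actual contract.
public class StaticCollectorPlugin implements CollectorPlugin {

	@Override
	public Stream<String> collect(final ApiDescriptor api) {
		// a real plugin would page through the endpoint at api.getBaseUrl()
		// and emit one XML record per harvested item
		return Stream.of("<record>1</record>", "<record>2</record>");
	}
}
```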
diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index cf0fa0efe..580c1e0f3 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -7,10 +7,44 @@ 1.2.4-SNAPSHOT dhp-aggregation - - + + + + net.alchim31.maven + scala-maven-plugin + ${net.alchim31.maven.version} + + + scala-compile-first + initialize + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + ${scala.version} + + + + + + + + org.apache.httpcomponents + httpclient + + org.apache.spark spark-core_2.11 @@ -24,15 +58,9 @@ eu.dnetlib.dhp dhp-common ${project.version} - - - com.sun.xml.bind - jaxb-core - - - - + + eu.dnetlib.dhp dhp-schemas ${project.version} @@ -58,6 +86,11 @@ jaxen + + org.json + json + + org.apache.commons @@ -78,8 +111,11 @@ commons-compress - + + org.mongodb + mongo-java-driver + - + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala new file mode 100644 index 000000000..8df203283 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala @@ -0,0 +1,86 @@ +package eu.dnetlib.dhp.actionmanager.datacite + +import org.apache.commons.io.IOUtils +import org.apache.http.client.methods.{HttpGet, HttpPost, HttpRequestBase, HttpUriRequest} +import org.apache.http.entity.StringEntity +import org.apache.http.impl.client.HttpClients + +import java.io.IOException + +abstract class AbstractRestClient extends Iterator[String]{ + + var buffer: List[String] = List() + var current_index:Int = 0 + + var scroll_value: Option[String] = None + + var complete:Boolean = false + + + def extractInfo(input: String): Unit + + protected def getBufferData(): Unit + + + def doHTTPGETRequest(url:String): String = { + val httpGet = new HttpGet(url) + doHTTPRequest(httpGet) + + } + + def doHTTPPOSTRequest(url:String, json:String): String = { + val httpPost = new HttpPost(url) + if (json != null) { + val entity = new StringEntity(json) + httpPost.setEntity(entity) + httpPost.setHeader("Accept", "application/json") + httpPost.setHeader("Content-type", "application/json") + } + doHTTPRequest(httpPost) + } + + def hasNext: Boolean = { + buffer.nonEmpty && current_index < buffer.size + } + + + override def next(): String = { + val next_item:String = buffer(current_index) + current_index = current_index + 1 + if (current_index == buffer.size) + getBufferData() + next_item + } + + + + + private def doHTTPRequest[A <: HttpUriRequest](r: A) :String ={ + val client = HttpClients.createDefault + var tries = 4 + try { + while (tries > 0) { + + println(s"requesting ${r.getURI}") + val response = client.execute(r) + println(s"get response with status ${response.getStatusLine.getStatusCode}") + if (response.getStatusLine.getStatusCode > 400) { + // error response: consume one retry and loop again; once the retry + // budget is exhausted an empty string is returned to the caller + tries -= 1 + } + else + return IOUtils.toString(response.getEntity.getContent) + } + "" + } catch { + case e: Throwable => + throw new RuntimeException("Error on executing request ", e) + } finally try client.close() + catch { + case e: IOException => + throw new RuntimeException("Unable to close client ", e) + } + } + + getBufferData() + +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteAPIImporter.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteAPIImporter.scala new file mode 100644 index 000000000..36ec9e8c3 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteAPIImporter.scala @@ -0,0 +1,31 @@ +package eu.dnetlib.dhp.actionmanager.datacite + +import org.json4s.{DefaultFormats, JValue} +import org.json4s.jackson.JsonMethods.{compact, parse, render} + +class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -1) extends AbstractRestClient { + + override def extractInfo(input: String): Unit = { + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json: org.json4s.JValue = parse(input) + buffer = (json \ "data").extract[List[JValue]].map(s => compact(render(s))) + val next_url = (json \ "links" \ "next").extractOrElse[String](null) + scroll_value = if (next_url != null && next_url.nonEmpty) Some(next_url) else None + if (scroll_value.isEmpty) + complete = true + current_index = 0 + } + + def get_url():String ={ + val to = if (until> 0) s"$until" else "*" + s"https://api.datacite.org/dois?page[cursor]=1&page[size]=$blocks&query=updated:[$timestamp%20TO%20$to]" + + } + + override def getBufferData(): Unit = { + if (!complete) { + val response = if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get) else doHTTPGETRequest(get_url()) + extractInfo(response) + } + } } \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala new file mode 100644 index 000000000..1776a4ad6 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala @@ -0,0 +1,482 @@ +package eu.dnetlib.dhp.actionmanager.datacite + +import com.fasterxml.jackson.databind.ObjectMapper +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup +import eu.dnetlib.dhp.schema.action.AtomicAction +import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Instance, KeyValue, Oaf, OafMapperUtils, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset} +import eu.dnetlib.dhp.utils.DHPUtils +import org.apache.commons.lang3.StringUtils +import org.json4s.DefaultFormats +import org.json4s.JsonAST.{JField, JObject, JString} +import org.json4s.jackson.JsonMethods.parse + +import java.nio.charset.CodingErrorAction +import java.text.SimpleDateFormat +import java.time.LocalDate +import java.time.format.DateTimeFormatter +import java.util.{Date, Locale} +import java.util.regex.Pattern +import scala.collection.JavaConverters._ +import scala.io.{Codec, Source} + + + +case class DataciteType(doi:String,timestamp:Long,isActive:Boolean, json:String ){} + +case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String],
nameIdentifier: Option[String]) {} + +case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {} + +case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {} + +case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {} + +case class DescriptionType(descriptionType: Option[String], description: Option[String]) {} + +case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {} + +case class DateType(date: Option[String], dateType: Option[String]) {} + +case class HostedByMapType(openaire_id: String, datacite_name: String, official_name: String, similarity: Option[Float]) {} + +object DataciteToOAFTransformation { + + implicit val codec: Codec = Codec("UTF-8") + codec.onMalformedInput(CodingErrorAction.REPLACE) + codec.onUnmappableCharacter(CodingErrorAction.REPLACE) + + + + private val PID_VOCABULARY = "dnet:pid_types" + val COBJ_VOCABULARY = "dnet:publication_resource" + val RESULT_VOCABULARY = "dnet:result_typologies" + val ACCESS_MODE_VOCABULARY = "dnet:access_modes" + val DOI_CLASS = "doi" + + val TITLE_SCHEME = "dnet:dataCite_title" + val SUBJ_CLASS = "keywords" + val SUBJ_SCHEME = "dnet:subject_classification_typologies" + + val j_filter:List[String] = { + val s = Source.fromInputStream(getClass.getResourceAsStream("datacite_filter")).mkString + s.lines.toList + } + + val mapper = new ObjectMapper() + val unknown_repository: HostedByMapType = HostedByMapType("openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18", "Unknown Repository", "Unknown Repository", Some(1.0F)) + + val dataInfo: DataInfo = generateDataInfo("0.9") + val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue("openaire____::9e3be59865b2c1c335d32dae2fe7b254", "Datacite") + + val hostedByMap: Map[String, HostedByMapType] = { + val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json: org.json4s.JValue = parse(s) + json.extract[Map[String, HostedByMapType]] + } + + val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH) + val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN) + + val funder_regex:List[(Pattern, String)] = List( + (Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE),"40|corda__h2020::"), + (Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE),"40|corda_______::") + + ) + + val Date_regex: List[Pattern] = List( + //Y-M-D + Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE), + //M-D-Y + Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE), + //D-M-Y + 
Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE), + //Y + Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE) + ) + + + def filter_json(json:String):Boolean = { + j_filter.exists(f => json.contains(f)) + } + + def toActionSet(item:Oaf) :(String, String) = { + val mapper = new ObjectMapper() + + item match { + case dataset: OafDataset => + val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset] + a.setClazz(classOf[OafDataset]) + a.setPayload(dataset) + (dataset.getClass.getCanonicalName, mapper.writeValueAsString(a)) + case publication: Publication => + val a: AtomicAction[Publication] = new AtomicAction[Publication] + a.setClazz(classOf[Publication]) + a.setPayload(publication) + (publication.getClass.getCanonicalName, mapper.writeValueAsString(a)) + case software: Software => + val a: AtomicAction[Software] = new AtomicAction[Software] + a.setClazz(classOf[Software]) + a.setPayload(software) + (software.getClass.getCanonicalName, mapper.writeValueAsString(a)) + case orp: OtherResearchProduct => + val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct] + a.setClazz(classOf[OtherResearchProduct]) + a.setPayload(orp) + (orp.getClass.getCanonicalName, mapper.writeValueAsString(a)) + + case relation: Relation => + val a: AtomicAction[Relation] = new AtomicAction[Relation] + a.setClazz(classOf[Relation]) + a.setPayload(relation) + (relation.getClass.getCanonicalName, mapper.writeValueAsString(a)) + case _ => + null + } + + } + + + + + def embargo_end(embargo_end_date: String): Boolean = { + val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]")) + val td = LocalDate.now() + td.isAfter(dt) + } + + + def extract_date(input: String): Option[String] = { + val d = Date_regex.map(pattern => { + val matcher = pattern.matcher(input) + if (matcher.find()) + matcher.group(0) + else + null + } + ).find(s => s != null) + + if (d.isDefined) { + val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get + try { + return Some(LocalDate.parse(a_date, df_en).toString) + } catch { + case _: Throwable => try { + return Some(LocalDate.parse(a_date, df_it).toString) + } catch { + case _: Throwable => + return None + } + } + } + d + } + + def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies:VocabularyGroup): (Qualifier, Qualifier) = { + if (resourceType != null && resourceType.nonEmpty) { + val typeQualifier = vocabularies.getSynonymAsQualifier(COBJ_VOCABULARY, resourceType) + if (typeQualifier != null) + return (typeQualifier, vocabularies.getSynonymAsQualifier(RESULT_VOCABULARY, typeQualifier.getClassid)) + } + if (schemaOrg != null && schemaOrg.nonEmpty) { + val typeQualifier = vocabularies.getSynonymAsQualifier(COBJ_VOCABULARY, schemaOrg) + if (typeQualifier != null) + return (typeQualifier, vocabularies.getSynonymAsQualifier(RESULT_VOCABULARY, typeQualifier.getClassid)) + + } + if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) { + val typeQualifier = vocabularies.getSynonymAsQualifier(COBJ_VOCABULARY, 
resourceTypeGeneral) + if (typeQualifier != null) + return (typeQualifier, vocabularies.getSynonymAsQualifier(RESULT_VOCABULARY, typeQualifier.getClassid)) + + } + null + } + + + def getResult(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies:VocabularyGroup): Result = { + val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies) + if (typeQualifiers == null) + return null + val i = new Instance + i.setInstancetype(typeQualifiers._1) + typeQualifiers._2.getClassname match { + case "dataset" => + val r = new OafDataset + r.setInstance(List(i).asJava) + return r + case "publication" => + val r = new Publication + r.setInstance(List(i).asJava) + return r + case "software" => + val r = new Software + r.setInstance(List(i).asJava) + return r + case "other" => + val r = new OtherResearchProduct + r.setInstance(List(i).asJava) + return r + } + null + } + + + def available_date(input: String): Boolean = { + + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json: org.json4s.JValue = parse(input) + val l: List[String] = for { + JObject(dates) <- json \\ "dates" + JField("dateType", JString(dateTypes)) <- dates + } yield dateTypes + + l.exists(p => p.equalsIgnoreCase("available")) + + } + + + def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = { + OafMapperUtils.structuredProperty(dt, q, null) + } + + def generateRelation(sourceId:String, targetId:String, relClass:String, cf:KeyValue, di:DataInfo) :Relation = { + + val r = new Relation + r.setSource(sourceId) + r.setTarget(targetId) + r.setRelType("resultProject") + r.setRelClass(relClass) + r.setSubRelType("outcome") + r.setCollectedfrom(List(cf).asJava) + r.setDataInfo(di) + r + + + } + + def get_projectRelation(awardUri:String, sourceId:String):List[Relation] = { + val match_pattern = funder_regex.find(s =>s._1.matcher(awardUri).find()) + + if (match_pattern.isDefined) { + val m =match_pattern.get._1 + val p = match_pattern.get._2 + val grantId = m.matcher(awardUri).replaceAll("$2") + val targetId = s"$p${DHPUtils.md5(grantId)}" + List( + generateRelation(sourceId, targetId,"isProducedBy", DATACITE_COLLECTED_FROM, dataInfo), + generateRelation(targetId, sourceId,"produces", DATACITE_COLLECTED_FROM, dataInfo) + ) + } + else + List() + + } + + + def generateOAF(input:String,ts:Long, dateOfCollection:Long, vocabularies: VocabularyGroup):List[Oaf] = { + if (filter_json(input)) + return List() + + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json = parse(input) + + val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null) + val resourceTypeGeneral = (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null) + val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null) + + val doi = (json \ "attributes" \ "doi").extract[String] + if (doi.isEmpty) + return List() + + //Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies + val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies) + if (result == null) + return List() + + + val doi_q = vocabularies.getSynonymAsQualifier(PID_VOCABULARY, "doi") + val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo) + result.setPid(List(pid).asJava) + result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true)) + result.setOriginalId(List(doi).asJava) + + 
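+ // Assuming OafMapperUtils.createOpenaireId follows the usual "<entityType>|<nsPrefix>::md5(localId)" convention,
+ // a DOI such as "10.5281/zenodo.12345" keeps its raw value in the pid set above, while the result id becomes
+ // "50|datacite____::" + DHPUtils.md5("10.5281/zenodo.12345")
+ // ts and dateOfCollection are expressed in epoch seconds, hence the * 1000 below when building java.util.Date values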
val d = new Date(dateOfCollection * 1000)
+ val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
+
+
+ result.setDateofcollection(ISO8601FORMAT.format(d))
+ result.setDateoftransformation(ISO8601FORMAT.format(new Date(ts * 1000)))
+ result.setDataInfo(dataInfo)
+
+ val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
+
+
+ val authors = creators.zipWithIndex.map { case (c, idx) =>
+   val a = new Author
+   a.setFullname(c.name.orNull)
+   a.setName(c.givenName.orNull)
+   a.setSurname(c.familyName.orNull)
+   if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) {
+     a.setPid(c.nameIdentifiers.get.map(ni => {
+       val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(PID_VOCABULARY, ni.nameIdentifierScheme.get.toLowerCase()) else null
+       if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
+         OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
+       }
+       else
+         null
+
+     }
+     ).filter(p => p != null)
+       .asJava)
+   }
+   if (c.affiliation.isDefined)
+     a.setAffiliation(c.affiliation.get.filter(af => af.nonEmpty).map(af => OafMapperUtils.field(af, dataInfo)).asJava)
+   a.setRank(idx + 1)
+   a
+ }
+
+
+
+
+ val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
+
+ result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => {
+   if (t.titleType.isEmpty) {
+     OafMapperUtils.structuredProperty(t.title.get, "main title", "main title", TITLE_SCHEME, TITLE_SCHEME, null)
+   } else {
+     OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, TITLE_SCHEME, TITLE_SCHEME, null)
+   }
+ }).asJava)
+
+ if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
+   return List()
+ result.setAuthor(authors.asJava)
+
+ val dates = (json \\ "dates").extract[List[DateType]]
+ val publication_year = (json \\ "publicationYear").extractOrElse[String](null)
+
+ val i_date = dates
+   .filter(d => d.date.isDefined && d.dateType.isDefined)
+   .find(d => d.dateType.get.equalsIgnoreCase("issued"))
+   .map(d => extract_date(d.date.get))
+ val a_date: Option[String] = dates
+   .filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available"))
+   .map(d => extract_date(d.date.get))
+   .find(d => d != null && d.isDefined)
+   .map(d => d.get)
+
+ if (a_date.isDefined) {
+   result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
+ }
+ if (i_date.isDefined && i_date.get.isDefined) {
+   result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
+   result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
+ }
+ else if (publication_year != null) {
+   result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
+   result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
+ }
+
+
+ result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined)
+   .map(d => (extract_date(d.date.get), d.dateType.get))
+   .filter(d => d._1.isDefined)
+   .map(d => (d._1.get, vocabularies.getTermAsQualifier("dnet:dataCite_date", d._2.toLowerCase())))
+   .filter(d => d._2 != null)
+   .map(d => generateOAFDate(d._1, d._2)).asJava)
+
+ val subjects = (json \\ "subjects").extract[List[SubjectType]]
+
+ result.setSubject(subjects.filter(s => s.subject.nonEmpty)
+   .map(s =>
+     OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, SUBJ_SCHEME, SUBJ_SCHEME, null)
+   ).asJava)
+
+
+ result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
+
+ val
descriptions = (json \\ "descriptions").extract[List[DescriptionType]] + + result.setDescription( + descriptions + .filter(d => d.description.isDefined). + map(d => + OafMapperUtils.field(d.description.get, null) + ).filter(s => s!=null).asJava) + + + val publisher = (json \\ "publisher").extractOrElse[String](null) + if (publisher != null) + result.setPublisher(OafMapperUtils.field(publisher, null)) + + + val language: String = (json \\ "language").extractOrElse[String](null) + + if (language != null) + result.setLanguage(vocabularies.getSynonymAsQualifier("dnet:languages", language)) + + + val instance = result.getInstance().get(0) + + val client = (json \ "relationships" \ "client" \\ "id").extractOpt[String] + + val accessRights:List[String] = for { + JObject(rightsList) <- json \\ "rightsList" + JField("rightsUri", JString(rightsUri)) <- rightsList + } yield rightsUri + + val aRights: Option[Qualifier] = accessRights.map(r => { + vocabularies.getSynonymAsQualifier(ACCESS_MODE_VOCABULARY, r) + }).find(q => q != null) + + + val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.qualifier("UNKNOWN", "not available", ACCESS_MODE_VOCABULARY, ACCESS_MODE_VOCABULARY) + + if (client.isDefined) { + val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository) + instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name)) + instance.setCollectedfrom(DATACITE_COLLECTED_FROM) + instance.setUrl(List(s"https://dx.doi.org/$doi").asJava) +// instance.setAccessright(access_rights_qualifier) + + //'http') and matches(., '.*(/licenses|/publicdomain|unlicense.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*')]"> + val license = accessRights + .find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*")) + if (license.isDefined) + instance.setLicense(OafMapperUtils.field(license.get, null)) + } + + + val awardUris:List[String] = for { + JObject(fundingReferences) <- json \\ "fundingReferences" + JField("awardUri", JString(awardUri)) <- fundingReferences + } yield awardUri + + val relations:List[Relation] =awardUris.flatMap(a=> get_projectRelation(a, result.getId)).filter(r => r!= null) + + if (relations!= null && relations.nonEmpty) { + List(result):::relations + } + else + List(result) + } + + def generateDataInfo(trust: String): DataInfo = { + val di = new DataInfo + di.setDeletedbyinference(false) + di.setInferred(false) + di.setInvisible(false) + di.setTrust(trust) + di.setProvenanceaction(OafMapperUtils.qualifier("sysimport:actionset", "sysimport:actionset", "dnet:provenanceActions", "dnet:provenanceActions")) + di + } + + def generateDSId(input: String): String = { + val b = StringUtils.substringBefore(input, "::") + val a = StringUtils.substringAfter(input, "::") + s"10|$b::${DHPUtils.md5(a)}" + } + + +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ExportActionSetJobNode.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ExportActionSetJobNode.scala new file mode 100644 index 000000000..9f0d25735 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ExportActionSetJobNode.scala @@ -0,0 +1,41 @@ +package eu.dnetlib.dhp.actionmanager.datacite + +import eu.dnetlib.dhp.application.ArgumentApplicationParser +import 
eu.dnetlib.dhp.schema.oaf.Oaf +import org.apache.hadoop.io.Text +import org.apache.hadoop.io.compress.GzipCodec +import org.apache.hadoop.mapred.SequenceFileOutputFormat +import org.apache.spark.SparkConf +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.slf4j.{Logger, LoggerFactory} + +import scala.io.Source + +object ExportActionSetJobNode { + + val log: Logger = LoggerFactory.getLogger(ExportActionSetJobNode.getClass) + + def main(args: Array[String]): Unit = { + val conf = new SparkConf + val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json")).mkString) + parser.parseArgument(args) + val master = parser.get("master") + val sourcePath = parser.get("sourcePath") + val targetPath = parser.get("targetPath") + + val spark: SparkSession = SparkSession.builder().config(conf) + .appName(ExportActionSetJobNode.getClass.getSimpleName) + .master(master) + .getOrCreate() + implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] + implicit val tEncoder:Encoder[(String,String)] = Encoders.tuple(Encoders.STRING,Encoders.STRING) + + spark.read.load(sourcePath).as[Oaf] + .map(o =>DataciteToOAFTransformation.toActionSet(o)) + .filter(o => o!= null) + .rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec]) + + + } + +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala new file mode 100644 index 000000000..168ad218a --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala @@ -0,0 +1,48 @@ +package eu.dnetlib.dhp.actionmanager.datacite + +import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup +import eu.dnetlib.dhp.schema.mdstore.MetadataRecord +import eu.dnetlib.dhp.schema.oaf.Oaf +import eu.dnetlib.dhp.utils.ISLookupClientFactory +import org.apache.spark.SparkConf +import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} +import org.slf4j.{Logger, LoggerFactory} + +import scala.io.Source + +object GenerateDataciteDatasetSpark { + + val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass) + + def main(args: Array[String]): Unit = { + val conf = new SparkConf + val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json")).mkString) + parser.parseArgument(args) + val master = parser.get("master") + val sourcePath = parser.get("sourcePath") + val targetPath = parser.get("targetPath") + val isLookupUrl: String = parser.get("isLookupUrl") + log.info("isLookupUrl: {}", isLookupUrl) + + val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl) + val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService) + log.info(s"vocabulary size is ${vocabularies.getTerms("dnet:languages").size()}") + val spark: SparkSession = SparkSession.builder().config(conf) + .appName(GenerateDataciteDatasetSpark.getClass.getSimpleName) + .master(master) + .getOrCreate() + + implicit val mrEncoder: Encoder[MetadataRecord] = 
Encoders.kryo[MetadataRecord]
+
+ implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
+
+ import spark.implicits._
+
+ spark.read.load(sourcePath).as[DataciteType]
+   .filter(d => d.isActive)
+   .flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies))
+   .filter(d => d != null)
+   .write.mode(SaveMode.Overwrite).save(targetPath)
+ }
+}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala
new file mode 100644
index 000000000..6cec4ea34
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala
@@ -0,0 +1,181 @@
+package eu.dnetlib.dhp.actionmanager.datacite
+
+import eu.dnetlib.dhp.actionmanager.datacite.DataciteToOAFTransformation.df_it
+import eu.dnetlib.dhp.application.ArgumentApplicationParser
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path}
+import org.apache.hadoop.hdfs.DistributedFileSystem
+import org.apache.hadoop.io.{IntWritable, SequenceFile, Text}
+import org.apache.spark.SparkContext
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.expressions.Aggregator
+import org.apache.spark.sql.{Dataset, Encoder, SaveMode, SparkSession}
+import org.json4s.DefaultFormats
+import org.json4s.jackson.JsonMethods.parse
+import org.apache.spark.sql.functions.max
+import org.slf4j.{Logger, LoggerFactory}
+
+import java.time.format.DateTimeFormatter._
+import java.time.{LocalDate, LocalDateTime, ZoneOffset}
+import scala.io.Source
+
+object ImportDatacite {
+
+ val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass)
+
+
+ def convertAPIStringToDataciteItem(input: String): DataciteType = {
+   implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
+   lazy val json: org.json4s.JValue = parse(input)
+   val doi = (json \ "attributes" \ "doi").extract[String].toLowerCase
+
+   val isActive = (json \ "attributes" \ "isActive").extract[Boolean]
+
+   val timestamp_string = (json \ "attributes" \ "updated").extract[String]
+   val dt = LocalDateTime.parse(timestamp_string, ISO_DATE_TIME)
+   DataciteType(doi = doi, timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000, isActive = isActive, json = input)
+
+ }
+
+
+ def main(args: Array[String]): Unit = {
+
+   val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json")).mkString)
+   parser.parseArgument(args)
+   val master = parser.get("master")
+
+   val hdfsuri = parser.get("namenode")
+   log.info(s"namenode is $hdfsuri")
+
+   val targetPath = parser.get("targetPath")
+   log.info(s"targetPath is $targetPath")
+
+   val dataciteDump = parser.get("dataciteDumpPath")
+   log.info(s"dataciteDump is $dataciteDump")
+
+   val hdfsTargetPath = new Path(targetPath)
+   log.info(s"hdfsTargetPath is $hdfsTargetPath")
+
+
+   val skipImport = parser.get("skipImport")
+   log.info(s"skipImport is $skipImport")
+
+   val spark: SparkSession = SparkSession.builder()
+     .appName(ImportDatacite.getClass.getSimpleName)
+     .master(master)
+     .getOrCreate()
+
+   // ====== Init HDFS File System Object
+   val conf = new Configuration
+   // Set FileSystem URI
+   conf.set("fs.defaultFS", hdfsuri)
+
+   // Because of Maven
+   conf.set("fs.hdfs.impl", classOf[DistributedFileSystem].getName)
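+   // Without these explicit bindings the FileSystem SPI lookup can fail (e.g. "No FileSystem for scheme: hdfs")
+   // when the job runs from a shaded jar whose META-INF/services entries have been merged away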
+   conf.set("fs.file.impl", classOf[LocalFileSystem].getName)
+   val sc: SparkContext = spark.sparkContext
+   sc.setLogLevel("ERROR")
+
+   import spark.implicits._
+
+
+   val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] = new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
+
+     override def zero: DataciteType = null
+
+     override def reduce(a: DataciteType, b: DataciteType): DataciteType = {
+       if (b == null)
+         return a
+       if (a == null)
+         return b
+       if (a.timestamp > b.timestamp) {
+         return a
+       }
+       b
+     }
+
+     override def merge(a: DataciteType, b: DataciteType): DataciteType = {
+       reduce(a, b)
+     }
+
+     override def bufferEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
+
+     override def outputEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
+
+     override def finish(reduction: DataciteType): DataciteType = reduction
+   }
+
+   val dump: Dataset[DataciteType] = spark.read.load(dataciteDump).as[DataciteType]
+   val ts = dump.select(max("timestamp")).first().getLong(0)
+
+   println(s"last Timestamp is $ts")
+
+   val cnt = if ("true".equalsIgnoreCase(skipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf)
+
+   println(s"Imported from Datacite API $cnt documents")
+
+   if (cnt > 0) {
+
+     val inputRdd: RDD[DataciteType] = sc.sequenceFile(targetPath, classOf[Int], classOf[Text])
+       .map(s => s._2.toString)
+       .map(s => convertAPIStringToDataciteItem(s))
+     spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset")
+
+     val ds: Dataset[DataciteType] = spark.read.load(s"${targetPath}_dataset").as[DataciteType]
+
+     dump
+       .union(ds)
+       .groupByKey(_.doi)
+       .agg(dataciteAggregator.toColumn)
+       .map(s => s._2)
+       .repartition(4000)
+       .write.mode(SaveMode.Overwrite).save(s"${dataciteDump}_updated")
+
+     val fs = FileSystem.get(sc.hadoopConfiguration)
+     fs.delete(new Path(s"$dataciteDump"), true)
+     fs.rename(new Path(s"${dataciteDump}_updated"), new Path(s"$dataciteDump"))
+   }
+ }
+
+ private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration): Long = {
+   var from: Long = timestamp * 1000
+   val delta: Long = 50000000L
+   var client: DataciteAPIImporter = null
+   val now: Long = System.currentTimeMillis()
+   var i = 0
+   try {
+     val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text]))
+     try {
+       var start: Long = System.currentTimeMillis
+       while (from < now) {
+         client = new DataciteAPIImporter(from, 1000, from + delta)
+         var end: Long = 0
+         val key: IntWritable = new IntWritable(i)
+         val value: Text = new Text
+         while (client.hasNext) {
+           key.set({
+             i += 1;
+             i - 1
+           })
+           value.set(client.next())
+           writer.append(key, value)
+           writer.hflush()
+           if (i % 1000 == 0) {
+             end = System.currentTimeMillis
+             val time = (end - start) / 1000.0F
+             println(s"Imported $i in $time seconds")
+             start = System.currentTimeMillis
+           }
+         }
+         println(s"updating from value: $from -> ${from + delta}")
+         from = from + delta
+       }
+     } catch {
+       case e: Throwable =>
+         log.error("Error while requesting and writing Datacite records", e)
+     } finally if (writer != null) writer.close()
+   }
+   i
+ }
+
+}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorPluginErrorLogList.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorPluginErrorLogList.java
deleted file mode 100644 index
9d3f88265..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorPluginErrorLogList.java +++ /dev/null @@ -1,20 +0,0 @@ - -package eu.dnetlib.dhp.actionmanager.project.httpconnector; - -import java.util.LinkedList; - -public class CollectorPluginErrorLogList extends LinkedList { - - private static final long serialVersionUID = -6925786561303289704L; - - @Override - public String toString() { - String log = new String(); - int index = 0; - for (String errorMessage : this) { - log += String.format("Retry #%s: %s / ", index++, errorMessage); - } - return log; - } - -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorServiceException.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorServiceException.java deleted file mode 100644 index 9167d97b4..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorServiceException.java +++ /dev/null @@ -1,20 +0,0 @@ - -package eu.dnetlib.dhp.actionmanager.project.httpconnector; - -public class CollectorServiceException extends Exception { - - private static final long serialVersionUID = 7523999812098059764L; - - public CollectorServiceException(String string) { - super(string); - } - - public CollectorServiceException(String string, Throwable exception) { - super(string, exception); - } - - public CollectorServiceException(Throwable exception) { - super(exception); - } - -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnector.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnector.java deleted file mode 100644 index e20518b55..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnector.java +++ /dev/null @@ -1,240 +0,0 @@ - -package eu.dnetlib.dhp.actionmanager.project.httpconnector; - -import java.io.IOException; -import java.io.InputStream; -import java.net.*; -import java.security.GeneralSecurityException; -import java.security.cert.X509Certificate; -import java.util.List; -import java.util.Map; - -import javax.net.ssl.HttpsURLConnection; -import javax.net.ssl.SSLContext; -import javax.net.ssl.TrustManager; -import javax.net.ssl.X509TrustManager; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.math.NumberUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -/** - * @author jochen, michele, andrea - */ -public class HttpConnector { - - private static final Log log = LogFactory.getLog(HttpConnector.class); - - private int maxNumberOfRetry = 6; - private int defaultDelay = 120; // seconds - private int readTimeOut = 120; // seconds - - private String responseType = null; - - private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)"; - - public HttpConnector() { - CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL)); - } - - /** - * Given the URL returns the content via HTTP GET - * - * @param requestUrl the URL - * @return the content of the downloaded resource - * @throws CollectorServiceException when retrying more than maxNumberOfRetry times - */ - public String getInputSource(final String requestUrl) throws CollectorServiceException { - return attemptDownlaodAsString(requestUrl, 1, new 
CollectorPluginErrorLogList()); - } - - /** - * Given the URL returns the content as a stream via HTTP GET - * - * @param requestUrl the URL - * @return the content of the downloaded resource as InputStream - * @throws CollectorServiceException when retrying more than maxNumberOfRetry times - */ - public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorServiceException { - return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); - } - - private String attemptDownlaodAsString(final String requestUrl, final int retryNumber, - final CollectorPluginErrorLogList errorList) - throws CollectorServiceException { - try { - InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); - try { - return IOUtils.toString(s); - } catch (IOException e) { - log.error("error while retrieving from http-connection occured: " + requestUrl, e); - Thread.sleep(defaultDelay * 1000); - errorList.add(e.getMessage()); - return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList); - } finally { - IOUtils.closeQuietly(s); - } - } catch (InterruptedException e) { - throw new CollectorServiceException(e); - } - } - - private InputStream attemptDownload(final String requestUrl, final int retryNumber, - final CollectorPluginErrorLogList errorList) - throws CollectorServiceException { - - if (retryNumber > maxNumberOfRetry) { - throw new CollectorServiceException("Max number of retries exceeded. Cause: \n " + errorList); - } - - log.debug("Downloading " + requestUrl + " - try: " + retryNumber); - try { - InputStream input = null; - - try { - final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection(); - urlConn.setInstanceFollowRedirects(false); - urlConn.setReadTimeout(readTimeOut * 1000); - urlConn.addRequestProperty("User-Agent", userAgent); - - if (log.isDebugEnabled()) { - logHeaderFields(urlConn); - } - - int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); - if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) { - log.warn("waiting and repeating request after " + retryAfter + " sec."); - Thread.sleep(retryAfter * 1000); - errorList.add("503 Service Unavailable"); - urlConn.disconnect(); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } else if ((urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM) - || (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP)) { - final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); - log.debug("The requested url has been moved to " + newUrl); - errorList - .add( - String - .format( - "%s %s. 
Moved to: %s", urlConn.getResponseCode(), urlConn.getResponseMessage(), - newUrl)); - urlConn.disconnect(); - return attemptDownload(newUrl, retryNumber + 1, errorList); - } else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) { - log - .error( - String - .format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); - Thread.sleep(defaultDelay * 1000); - errorList.add(String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); - urlConn.disconnect(); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } else { - input = urlConn.getInputStream(); - responseType = urlConn.getContentType(); - return input; - } - } catch (IOException e) { - log.error("error while retrieving from http-connection occured: " + requestUrl, e); - Thread.sleep(defaultDelay * 1000); - errorList.add(e.getMessage()); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } - } catch (InterruptedException e) { - throw new CollectorServiceException(e); - } - } - - private void logHeaderFields(final HttpURLConnection urlConn) throws IOException { - log.debug("StatusCode: " + urlConn.getResponseMessage()); - - for (Map.Entry> e : urlConn.getHeaderFields().entrySet()) { - if (e.getKey() != null) { - for (String v : e.getValue()) { - log.debug(" key: " + e.getKey() + " - value: " + v); - } - } - } - } - - private int obtainRetryAfter(final Map> headerMap) { - for (String key : headerMap.keySet()) { - if ((key != null) && key.toLowerCase().equals("retry-after") && (headerMap.get(key).size() > 0) - && NumberUtils.isCreatable(headerMap.get(key).get(0))) { - return Integer - .parseInt(headerMap.get(key).get(0)) + 10; - } - } - return -1; - } - - private String obtainNewLocation(final Map> headerMap) throws CollectorServiceException { - for (String key : headerMap.keySet()) { - if ((key != null) && key.toLowerCase().equals("location") && (headerMap.get(key).size() > 0)) { - return headerMap.get(key).get(0); - } - } - throw new CollectorServiceException("The requested url has been MOVED, but 'location' param is MISSING"); - } - - /** - * register for https scheme; this is a workaround and not intended for the use in trusted environments - */ - public void initTrustManager() { - final X509TrustManager tm = new X509TrustManager() { - - @Override - public void checkClientTrusted(final X509Certificate[] xcs, final String string) { - } - - @Override - public void checkServerTrusted(final X509Certificate[] xcs, final String string) { - } - - @Override - public X509Certificate[] getAcceptedIssuers() { - return null; - } - }; - try { - final SSLContext ctx = SSLContext.getInstance("TLS"); - ctx.init(null, new TrustManager[] { - tm - }, null); - HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory()); - } catch (GeneralSecurityException e) { - log.fatal(e); - throw new IllegalStateException(e); - } - } - - public int getMaxNumberOfRetry() { - return maxNumberOfRetry; - } - - public void setMaxNumberOfRetry(final int maxNumberOfRetry) { - this.maxNumberOfRetry = maxNumberOfRetry; - } - - public int getDefaultDelay() { - return defaultDelay; - } - - public void setDefaultDelay(final int defaultDelay) { - this.defaultDelay = defaultDelay; - } - - public int getReadTimeOut() { - return readTimeOut; - } - - public void setReadTimeOut(final int readTimeOut) { - this.readTimeOut = readTimeOut; - } - - public String getResponseType() { - return responseType; - } - -} diff --git 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java index 9dac34a15..cad6b94e1 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java @@ -17,8 +17,8 @@ import org.apache.hadoop.fs.Path; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.collection.HttpConnector2; /** * Applies the parsing of a csv file and writes the Serialization of it in hdfs @@ -74,7 +74,7 @@ public class ReadCSV implements Closeable { throws Exception { this.conf = new Configuration(); this.conf.set("fs.defaultFS", hdfsNameNode); - HttpConnector httpConnector = new HttpConnector(); + HttpConnector2 httpConnector = new HttpConnector2(); FileSystem fileSystem = FileSystem.get(this.conf); Path hdfsWritePath = new Path(hdfsPath); FSDataOutputStream fsDataOutputStream = null; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java index 23b58f2a0..fc3b38ac5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java @@ -14,13 +14,12 @@ import org.apache.hadoop.fs.Path; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.collection.HttpConnector2; /** * Applies the parsing of an excel file and writes the Serialization of it in hdfs */ - public class ReadExcel implements Closeable { private static final Log log = LogFactory.getLog(ReadCSV.class); private final Configuration conf; @@ -72,7 +71,7 @@ public class ReadExcel implements Closeable { throws Exception { this.conf = new Configuration(); this.conf.set("fs.defaultFS", hdfsNameNode); - HttpConnector httpConnector = new HttpConnector(); + HttpConnector2 httpConnector = new HttpConnector2(); FileSystem fileSystem = FileSystem.get(this.conf); Path hdfsWritePath = new Path(hdfsPath); FSDataOutputStream fsDataOutputStream = null; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationCounter.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationCounter.java new file mode 100644 index 000000000..bf2fd22cb --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationCounter.java @@ -0,0 +1,45 @@ + +package eu.dnetlib.dhp.aggregation.common; + +import java.io.Serializable; + +import org.apache.spark.util.LongAccumulator; + +public class AggregationCounter implements Serializable { + private LongAccumulator totalItems; + private LongAccumulator errorItems; + private LongAccumulator processedItems; + + public AggregationCounter() { + } + + public AggregationCounter(LongAccumulator totalItems, LongAccumulator errorItems, LongAccumulator processedItems) { + this.totalItems = totalItems; + this.errorItems = 
errorItems; + this.processedItems = processedItems; + } + + public LongAccumulator getTotalItems() { + return totalItems; + } + + public void setTotalItems(LongAccumulator totalItems) { + this.totalItems = totalItems; + } + + public LongAccumulator getErrorItems() { + return errorItems; + } + + public void setErrorItems(LongAccumulator errorItems) { + this.errorItems = errorItems; + } + + public LongAccumulator getProcessedItems() { + return processedItems; + } + + public void setProcessedItems(LongAccumulator processedItems) { + this.processedItems = processedItems; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregatorReport.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregatorReport.java new file mode 100644 index 000000000..c822a6723 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregatorReport.java @@ -0,0 +1,47 @@ + +package eu.dnetlib.dhp.aggregation.common; + +import java.io.Closeable; +import java.io.IOException; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Objects; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.gson.Gson; + +import eu.dnetlib.dhp.message.MessageSender; +import eu.dnetlib.dhp.utils.DHPUtils; + +public class AggregatorReport extends LinkedHashMap implements Closeable { + + private static final Logger log = LoggerFactory.getLogger(AggregatorReport.class); + + private MessageSender messageSender; + + public AggregatorReport() { + } + + public AggregatorReport(MessageSender messageSender) throws IOException { + this.messageSender = messageSender; + } + + public void ongoing(Long current, Long total) { + messageSender.sendMessage(current, total); + } + + @Override + public void close() throws IOException { + if (Objects.nonNull(messageSender)) { + log.info("closing report: "); + this.forEach((k, v) -> log.info("{} - {}", k, v)); + + Map m = new HashMap<>(); + m.put(getClass().getSimpleName().toLowerCase(), DHPUtils.MAPPER.writeValueAsString(values())); + messageSender.sendReport(m); + } + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/ReporterCallback.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/ReporterCallback.java new file mode 100644 index 000000000..b289b6e07 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/ReporterCallback.java @@ -0,0 +1,10 @@ + +package eu.dnetlib.dhp.aggregation.common; + +public interface ReporterCallback { + + Long getCurrent(); + + Long getTotal(); + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/ReportingJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/ReportingJob.java new file mode 100644 index 000000000..791226034 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/ReportingJob.java @@ -0,0 +1,41 @@ + +package eu.dnetlib.dhp.aggregation.common; + +import java.util.TimerTask; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; + +public abstract class ReportingJob { + + /** + * Frequency (seconds) for sending ongoing messages to report the collection task advancement + */ + public static final int ONGOING_REPORT_FREQUENCY = 5; + + /** + * Initial delay 
(seconds) for sending ongoing messages to report the collection task advancement + */ + public static final int INITIAL_DELAY = 2; + + private ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor(); + + protected final AggregatorReport report; + + public ReportingJob(AggregatorReport report) { + this.report = report; + } + + protected void schedule(final ReporterCallback callback) { + executor.scheduleAtFixedRate(new TimerTask() { + @Override + public void run() { + report.ongoing(callback.getCurrent(), callback.getTotal()); + } + }, INITIAL_DELAY, ONGOING_REPORT_FREQUENCY, TimeUnit.SECONDS); + } + + protected void shutdown() { + executor.shutdown(); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java new file mode 100644 index 000000000..09f3ffd63 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java @@ -0,0 +1,136 @@ + +package eu.dnetlib.dhp.aggregation.mdstore; + +import static eu.dnetlib.dhp.common.Constants.*; +import static eu.dnetlib.dhp.utils.DHPUtils.*; + +import java.net.URI; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.rest.DNetRestClient; + +public class MDStoreActionNode { + private static final Logger log = LoggerFactory.getLogger(MDStoreActionNode.class); + + enum MDAction { + NEW_VERSION, ROLLBACK, COMMIT, READ_LOCK, READ_UNLOCK + } + + public static String NEW_VERSION_URI = "%s/mdstore/%s/newVersion"; + + public static final String COMMIT_VERSION_URL = "%s/version/%s/commit/%s"; + public static final String ROLLBACK_VERSION_URL = "%s/version/%s/abort"; + + public static final String READ_LOCK_URL = "%s/mdstore/%s/startReading"; + public static final String READ_UNLOCK_URL = "%s/version/%s/endReading"; + + private static final String MDSTOREVERSIONPARAM = "mdStoreVersion"; + private static final String MDSTOREREADLOCKPARAM = "mdStoreReadLockVersion"; + + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser( + IOUtils + .toString( + MDStoreActionNode.class + .getResourceAsStream( + "/eu/dnetlib/dhp/collection/mdstore_action_parameters.json"))); + argumentParser.parseArgument(args); + + log.info("Java Xmx: {}m", Runtime.getRuntime().maxMemory() / (1024 * 1024)); + + final MDAction action = MDAction.valueOf(argumentParser.get("action")); + log.info("Current action is {}", action); + + final String mdStoreManagerURI = argumentParser.get("mdStoreManagerURI"); + log.info("mdStoreManagerURI is {}", mdStoreManagerURI); + + switch (action) { + case NEW_VERSION: { + final String mdStoreID = argumentParser.get("mdStoreID"); + if (StringUtils.isBlank(mdStoreID)) { + throw new IllegalArgumentException("missing or empty argument mdStoreId"); + } + final MDStoreVersion currentVersion = DNetRestClient + .doGET(String.format(NEW_VERSION_URI, mdStoreManagerURI, mdStoreID), MDStoreVersion.class); + populateOOZIEEnv(MDSTOREVERSIONPARAM, 
MAPPER.writeValueAsString(currentVersion)); + break; + } + case COMMIT: { + + final String hdfsuri = argumentParser.get("namenode"); + if (StringUtils.isBlank(hdfsuri)) { + throw new IllegalArgumentException("missing or empty argument namenode"); + } + final String mdStoreVersion_params = argumentParser.get("mdStoreVersion"); + final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class); + + if (StringUtils.isBlank(mdStoreVersion.getId())) { + throw new IllegalArgumentException( + "invalid MDStoreVersion value current is " + mdStoreVersion_params); + } + Path hdfstoreSizepath = new Path(mdStoreVersion.getHdfsPath() + MDSTORE_SIZE_PATH); + + try ( + FileSystem fs = FileSystem.get(URI.create(hdfsuri), getHadoopConfiguration(hdfsuri)); + FSDataInputStream inputStream = fs.open(hdfstoreSizepath)) { + + final Long mdStoreSize = Long.parseLong(IOUtils.toString(inputStream)); + + fs.create(hdfstoreSizepath); + DNetRestClient + .doGET( + String.format(COMMIT_VERSION_URL, mdStoreManagerURI, mdStoreVersion.getId(), mdStoreSize)); + } + + break; + } + case ROLLBACK: { + final String mdStoreVersion_params = argumentParser.get("mdStoreVersion"); + final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class); + + if (StringUtils.isBlank(mdStoreVersion.getId())) { + throw new IllegalArgumentException( + "invalid MDStoreVersion value current is " + mdStoreVersion_params); + } + DNetRestClient.doGET(String.format(ROLLBACK_VERSION_URL, mdStoreManagerURI, mdStoreVersion.getId())); + break; + } + + case READ_LOCK: { + final String mdStoreID = argumentParser.get("mdStoreID"); + if (StringUtils.isBlank(mdStoreID)) { + throw new IllegalArgumentException("missing or empty argument mdStoreId"); + } + final MDStoreVersion currentVersion = DNetRestClient + .doGET(String.format(READ_LOCK_URL, mdStoreManagerURI, mdStoreID), MDStoreVersion.class); + populateOOZIEEnv(MDSTOREREADLOCKPARAM, MAPPER.writeValueAsString(currentVersion)); + break; + } + case READ_UNLOCK: { + final String mdStoreVersion_params = argumentParser.get("readMDStoreId"); + final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class); + + if (StringUtils.isBlank(mdStoreVersion.getId())) { + throw new IllegalArgumentException( + "invalid MDStoreVersion value current is " + mdStoreVersion_params); + } + DNetRestClient.doGET(String.format(READ_UNLOCK_URL, mdStoreManagerURI, mdStoreVersion.getId())); + break; + } + + default: + throw new IllegalArgumentException("invalid action"); + } + + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorException.java similarity index 50% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorException.java index f40962c21..144d297e6 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorException.java @@ -1,16 +1,16 @@ -package eu.dnetlib.dhp.collection.worker; +package eu.dnetlib.dhp.collection; -public class DnetCollectorException extends Exception { +public class CollectorException extends Exception { /** */ private static final long serialVersionUID = 
-290723075076039757L; - public DnetCollectorException() { + public CollectorException() { super(); } - public DnetCollectorException( + public CollectorException( final String message, final Throwable cause, final boolean enableSuppression, @@ -18,15 +18,15 @@ public class DnetCollectorException extends Exception { super(message, cause, enableSuppression, writableStackTrace); } - public DnetCollectorException(final String message, final Throwable cause) { + public CollectorException(final String message, final Throwable cause) { super(message, cause); } - public DnetCollectorException(final String message) { + public CollectorException(final String message) { super(message); } - public DnetCollectorException(final Throwable cause) { + public CollectorException(final Throwable cause) { super(cause); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java new file mode 100644 index 000000000..f9d7d7dae --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java @@ -0,0 +1,134 @@ + +package eu.dnetlib.dhp.collection; + +import static eu.dnetlib.dhp.common.Constants.SEQUENCE_FILE_NAME; + +import java.io.IOException; +import java.util.Optional; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.DeflateCodec; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; +import eu.dnetlib.dhp.aggregation.common.ReporterCallback; +import eu.dnetlib.dhp.aggregation.common.ReportingJob; +import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; +import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin; +import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin; +import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin; +import eu.dnetlib.dhp.collection.plugin.rest.RestCollectorPlugin; + +public class CollectorWorker extends ReportingJob { + + private static final Logger log = LoggerFactory.getLogger(CollectorWorker.class); + + private final ApiDescriptor api; + + private final FileSystem fileSystem; + + private final MDStoreVersion mdStoreVersion; + + private final HttpClientParams clientParams; + + public CollectorWorker( + final ApiDescriptor api, + final FileSystem fileSystem, + final MDStoreVersion mdStoreVersion, + final HttpClientParams clientParams, + final AggregatorReport report) { + super(report); + this.api = api; + this.fileSystem = fileSystem; + this.mdStoreVersion = mdStoreVersion; + this.clientParams = clientParams; + } + + public void collect() throws UnknownCollectorPluginException, CollectorException, IOException { + + final String outputPath = mdStoreVersion.getHdfsPath() + SEQUENCE_FILE_NAME; + log.info("outputPath path is {}", outputPath); + + final CollectorPlugin plugin = getCollectorPlugin(); + final AtomicInteger counter = new AtomicInteger(0); + + scheduleReport(counter); + + try (SequenceFile.Writer writer = SequenceFile + .createWriter( + fileSystem.getConf(), + SequenceFile.Writer.file(new Path(outputPath)), + SequenceFile.Writer.keyClass(IntWritable.class), + 
SequenceFile.Writer.valueClass(Text.class), + SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) { + final IntWritable key = new IntWritable(counter.get()); + final Text value = new Text(); + plugin + .collect(api, report) + .forEach( + content -> { + key.set(counter.getAndIncrement()); + value.set(content); + try { + writer.append(key, value); + } catch (Throwable e) { + throw new RuntimeException(e); + } + }); + } catch (Throwable e) { + report.put(e.getClass().getName(), e.getMessage()); + throw new CollectorException(e); + } finally { + shutdown(); + report.ongoing(counter.longValue(), counter.longValue()); + } + } + + private void scheduleReport(AtomicInteger counter) { + schedule(new ReporterCallback() { + @Override + public Long getCurrent() { + return counter.longValue(); + } + + @Override + public Long getTotal() { + return null; + } + }); + } + + private CollectorPlugin getCollectorPlugin() throws UnknownCollectorPluginException { + + switch (CollectorPlugin.NAME.valueOf(api.getProtocol())) { + case oai: + return new OaiCollectorPlugin(clientParams); + case rest_json2xml: + return new RestCollectorPlugin(clientParams); + case other: + final CollectorPlugin.NAME.OTHER_NAME plugin = Optional + .ofNullable(api.getParams().get("other_plugin_type")) + .map(CollectorPlugin.NAME.OTHER_NAME::valueOf) + .get(); + + switch (plugin) { + case mdstore_mongodb_dump: + return new MongoDbDumpCollectorPlugin(fileSystem); + case mdstore_mongodb: + return new MDStoreCollectorPlugin(); + default: + throw new UnknownCollectorPluginException("plugin is not managed: " + plugin); + } + default: + throw new UnknownCollectorPluginException("protocol is not managed: " + api.getProtocol()); + } + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerApplication.java new file mode 100644 index 000000000..2c5640499 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerApplication.java @@ -0,0 +1,135 @@ + +package eu.dnetlib.dhp.collection; + +import static eu.dnetlib.dhp.common.Constants.*; +import static eu.dnetlib.dhp.utils.DHPUtils.*; + +import java.io.IOException; +import java.util.Optional; + +import org.apache.commons.cli.ParseException; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.fs.FileSystem; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.message.MessageSender; + +/** + * CollectorWorkerApplication is the main class responsible to start the metadata collection process, storing the outcomes + * into HDFS. 
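+ * The concrete plugin (OAI, REST, MongoDB mdstore or MongoDB dump) is selected at runtime from the protocol
+ * declared in the ApiDescriptor, and the collected records are appended to the sequence file of the target
+ * mdstore version.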
+ * This application is executed on the Hadoop cluster; invoked in the context of the metadata collection
+ * Oozie workflow, it receives all the input parameters necessary to instantiate the specific collection plugin and the
+ * related plugin-specific configuration.
+ *
+ * @author Sandro La Bruzzo, Claudio Atzori
+ */
+public class CollectorWorkerApplication {
+
+ private static final Logger log = LoggerFactory.getLogger(CollectorWorkerApplication.class);
+
+ private final FileSystem fileSystem;
+
+ public CollectorWorkerApplication(FileSystem fileSystem) {
+  this.fileSystem = fileSystem;
+ }
+
+ /**
+  * @param args the application arguments, see collector_worker_input_parameter.json
+  */
+ public static void main(final String[] args)
+  throws ParseException, IOException, UnknownCollectorPluginException, CollectorException {
+
+  final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser(
+   IOUtils
+    .toString(
+     CollectorWorkerApplication.class
+      .getResourceAsStream(
+       "/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json")));
+  argumentParser.parseArgument(args);
+
+  log.info("Java Xmx: {}m", Runtime.getRuntime().maxMemory() / (1024 * 1024));
+
+  final String hdfsuri = argumentParser.get("namenode");
+  log.info("hdfsURI is {}", hdfsuri);
+
+  final String apiDescriptor = argumentParser.get("apidescriptor");
+  log.info("apiDescriptor is {}", apiDescriptor);
+
+  final String mdStoreVersion = argumentParser.get("mdStoreVersion");
+  log.info("mdStoreVersion is {}", mdStoreVersion);
+
+  final String dnetMessageManagerURL = argumentParser.get(DNET_MESSAGE_MGR_URL);
+  log.info("dnetMessageManagerURL is {}", dnetMessageManagerURL);
+
+  final String workflowId = argumentParser.get("workflowId");
+  log.info("workflowId is {}", workflowId);
+
+  final HttpClientParams clientParams = getClientParams(argumentParser);
+
+  final ApiDescriptor api = MAPPER.readValue(apiDescriptor, ApiDescriptor.class);
+  final FileSystem fileSystem = FileSystem.get(getHadoopConfiguration(hdfsuri));
+
+  new CollectorWorkerApplication(fileSystem)
+   .run(mdStoreVersion, clientParams, api, dnetMessageManagerURL, workflowId);
+ }
+
+ protected void run(String mdStoreVersion, HttpClientParams clientParams, ApiDescriptor api,
+  String dnetMessageManagerURL, String workflowId)
+  throws IOException, CollectorException, UnknownCollectorPluginException {
+
+  final MDStoreVersion currentVersion = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class);
+  final MessageSender ms = new MessageSender(dnetMessageManagerURL, workflowId);
+
+  try (AggregatorReport report = new AggregatorReport(ms)) {
+   new CollectorWorker(api, fileSystem, currentVersion, clientParams, report).collect();
+  }
+ }
+
+ private static HttpClientParams getClientParams(ArgumentApplicationParser argumentParser) {
+  final HttpClientParams clientParams = new HttpClientParams();
+  clientParams
+   .setMaxNumberOfRetry(
+    Optional
+     .ofNullable(argumentParser.get(MAX_NUMBER_OF_RETRY))
+     .map(Integer::parseInt)
+     .orElse(HttpClientParams._maxNumberOfRetry));
+  log.info("maxNumberOfRetry is {}", clientParams.getMaxNumberOfRetry());
+
+  clientParams
+   .setRequestDelay(
+    Optional
+     .ofNullable(argumentParser.get(REQUEST_DELAY))
+     .map(Integer::parseInt)
+     .orElse(HttpClientParams._requestDelay));
+  log.info("requestDelay is {}", clientParams.getRequestDelay());
+
+  clientParams
+   .setRetryDelay(
+    Optional
+     .ofNullable(argumentParser.get(RETRY_DELAY))
+     .map(Integer::parseInt)
+     .orElse(HttpClientParams._retryDelay));
+  log.info("retryDelay is {}", clientParams.getRetryDelay());
+
+  clientParams +
.setConnectTimeOut( + Optional + .ofNullable(argumentParser.get(CONNECT_TIMEOUT)) + .map(Integer::parseInt) + .orElse(HttpClientParams._connectTimeOut)); + log.info("connectTimeOut is {}", clientParams.getConnectTimeOut()); + + clientParams + .setReadTimeOut( + Optional + .ofNullable(argumentParser.get(READ_TIMEOUT)) + .map(Integer::parseInt) + .orElse(HttpClientParams._readTimeOut)); + log.info("readTimeOut is {}", clientParams.getReadTimeOut()); + return clientParams; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java index 861ae5201..043da31f9 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java @@ -1,28 +1,26 @@ package eu.dnetlib.dhp.collection; +import static eu.dnetlib.dhp.common.Constants.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.utils.DHPUtils.*; import java.io.ByteArrayInputStream; +import java.io.IOException; import java.nio.charset.StandardCharsets; -import java.util.HashMap; -import java.util.Map; import java.util.Objects; import java.util.Optional; -import org.apache.commons.cli.*; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoder; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SparkSession; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.*; +import org.apache.spark.sql.expressions.Aggregator; import org.apache.spark.util.LongAccumulator; import org.dom4j.Document; import org.dom4j.Node; @@ -30,19 +28,172 @@ import org.dom4j.io.SAXReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.databind.ObjectMapper; - +import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.model.mdstore.Provenance; -import eu.dnetlib.message.Message; -import eu.dnetlib.message.MessageManager; -import eu.dnetlib.message.MessageType; +import eu.dnetlib.dhp.schema.mdstore.MetadataRecord; +import eu.dnetlib.dhp.schema.mdstore.Provenance; +import scala.Tuple2; public class GenerateNativeStoreSparkJob { private static final Logger log = LoggerFactory.getLogger(GenerateNativeStoreSparkJob.class); + public static void main(String[] args) throws Exception { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GenerateNativeStoreSparkJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/collection/generate_native_input_parameters.json"))); + parser.parseArgument(args); + + final String provenanceArgument = parser.get("provenance"); + log.info("Provenance is {}", provenanceArgument); + final Provenance provenance = MAPPER.readValue(provenanceArgument, Provenance.class); + + final String dateOfCollectionArgs = parser.get("dateOfCollection"); + 
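+ // NOTE: dateOfCollection is assumed to arrive as epoch milliseconds serialized to a string by the
+ // calling workflow (inferred from its use below); a malformed value fails fast with a NumberFormatException.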
log.info("dateOfCollection is {}", dateOfCollectionArgs); + final Long dateOfCollection = Long.valueOf(dateOfCollectionArgs); + + String mdStoreVersion = parser.get("mdStoreVersion"); + log.info("mdStoreVersion is {}", mdStoreVersion); + + final MDStoreVersion currentVersion = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class); + + String readMdStoreVersionParam = parser.get("readMdStoreVersion"); + log.info("readMdStoreVersion is {}", readMdStoreVersionParam); + + final MDStoreVersion readMdStoreVersion = StringUtils.isBlank(readMdStoreVersionParam) ? null + : MAPPER.readValue(readMdStoreVersionParam, MDStoreVersion.class); + + final String xpath = parser.get("xpath"); + log.info("xpath is {}", xpath); + + final String encoding = parser.get("encoding"); + log.info("encoding is {}", encoding); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> createNativeMDStore( + spark, provenance, dateOfCollection, xpath, encoding, currentVersion, readMdStoreVersion)); + } + + private static void createNativeMDStore(SparkSession spark, + Provenance provenance, + Long dateOfCollection, + String xpath, + String encoding, + MDStoreVersion currentVersion, + MDStoreVersion readVersion) throws IOException { + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + final LongAccumulator totalItems = sc.sc().longAccumulator(CONTENT_TOTALITEMS); + final LongAccumulator invalidRecords = sc.sc().longAccumulator(CONTENT_INVALIDRECORDS); + + final String seqFilePath = currentVersion.getHdfsPath() + SEQUENCE_FILE_NAME; + final JavaRDD<MetadataRecord> nativeStore = sc + .sequenceFile(seqFilePath, IntWritable.class, Text.class) + .map( + item -> parseRecord( + item._2().toString(), + xpath, + encoding, + provenance, + dateOfCollection, + totalItems, + invalidRecords)) + .filter(Objects::nonNull) + .distinct(); + + final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class); + final Dataset<MetadataRecord> mdstore = spark.createDataset(nativeStore.rdd(), encoder); + + final String targetPath = currentVersion.getHdfsPath() + MDSTORE_DATA_PATH; + + if (readVersion != null) { // INCREMENTAL MODE + log.info("updating {} incrementally with {}", targetPath, readVersion.getHdfsPath()); + Dataset<MetadataRecord> currentMdStoreVersion = spark + .read() + .load(readVersion.getHdfsPath() + MDSTORE_DATA_PATH) + .as(encoder); + TypedColumn<MetadataRecord, MetadataRecord> aggregator = new MDStoreAggregator().toColumn(); + + final Dataset<MetadataRecord> map = currentMdStoreVersion + .union(mdstore) + .groupByKey( + (MapFunction<MetadataRecord, String>) MetadataRecord::getId, + Encoders.STRING()) + .agg(aggregator) + .map((MapFunction<Tuple2<String, MetadataRecord>, MetadataRecord>) Tuple2::_2, encoder); + + map.select("id").takeAsList(100).forEach(s -> log.info(s.toString())); + + saveDataset(map, targetPath); + + } else { + saveDataset(mdstore, targetPath); + } + + final Long total = spark.read().load(targetPath).count(); + log.info("collected {} records for datasource '{}'", total, provenance.getDatasourceName()); + + writeHdfsFile( + spark.sparkContext().hadoopConfiguration(), total.toString(), + currentVersion.getHdfsPath() + MDSTORE_SIZE_PATH); + } + + public static class MDStoreAggregator extends Aggregator<MetadataRecord, MetadataRecord, MetadataRecord> { + + @Override + public MetadataRecord zero() { + return null; + } + + @Override + public MetadataRecord reduce(MetadataRecord b, MetadataRecord a) { + return getLatestRecord(b, 
a); + } + + @Override + public MetadataRecord merge(MetadataRecord b, MetadataRecord a) { + return getLatestRecord(b, a); + } + + private MetadataRecord getLatestRecord(MetadataRecord b, MetadataRecord a) { + if (b == null) + return a; + + if (a == null) + return b; + return (a.getDateOfCollection() > b.getDateOfCollection()) ? a : b; + } + + @Override + public MetadataRecord finish(MetadataRecord r) { + return r; + } + + @Override + public Encoder<MetadataRecord> bufferEncoder() { + return Encoders.bean(MetadataRecord.class); + } + + @Override + public Encoder<MetadataRecord> outputEncoder() { + return Encoders.bean(MetadataRecord.class); + } + + } + public static MetadataRecord parseRecord( final String input, final String xpath, @@ -64,112 +215,11 @@ public class GenerateNativeStoreSparkJob { invalidRecords.add(1); return null; } - return new MetadataRecord(originalIdentifier, encoding, provenance, input, dateOfCollection); + return new MetadataRecord(originalIdentifier, encoding, provenance, document.asXML(), dateOfCollection); } catch (Throwable e) { - if (invalidRecords != null) - invalidRecords.add(1); - e.printStackTrace(); + invalidRecords.add(1); return null; } } - public static void main(String[] args) throws Exception { - - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - GenerateNativeStoreSparkJob.class - .getResourceAsStream( - "/eu/dnetlib/dhp/collection/collection_input_parameters.json"))); - parser.parseArgument(args); - final ObjectMapper jsonMapper = new ObjectMapper(); - final Provenance provenance = jsonMapper.readValue(parser.get("provenance"), Provenance.class); - final long dateOfCollection = new Long(parser.get("dateOfCollection")); - - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - - final Map<String, String> ongoingMap = new HashMap<>(); - final Map<String, String> reportMap = new HashMap<>(); - - final boolean test = parser.get("isTest") == null ? 
false : Boolean.valueOf(parser.get("isTest")); - - SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - - final JavaPairRDD inputRDD = sc - .sequenceFile(parser.get("input"), IntWritable.class, Text.class); - - final LongAccumulator totalItems = sc.sc().longAccumulator("TotalItems"); - final LongAccumulator invalidRecords = sc.sc().longAccumulator("InvalidRecords"); - - final MessageManager manager = new MessageManager( - parser.get("rabbitHost"), - parser.get("rabbitUser"), - parser.get("rabbitPassword"), - false, - false, - null); - - final JavaRDD mappeRDD = inputRDD - .map( - item -> parseRecord( - item._2().toString(), - parser.get("xpath"), - parser.get("encoding"), - provenance, - dateOfCollection, - totalItems, - invalidRecords)) - .filter(Objects::nonNull) - .distinct(); - - ongoingMap.put("ongoing", "0"); - if (!test) { - manager - .sendMessage( - new Message( - parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap), - parser.get("rabbitOngoingQueue"), - true, - false); - } - - final Encoder encoder = Encoders.bean(MetadataRecord.class); - final Dataset mdstore = spark.createDataset(mappeRDD.rdd(), encoder); - final LongAccumulator mdStoreRecords = sc.sc().longAccumulator("MDStoreRecords"); - mdStoreRecords.add(mdstore.count()); - ongoingMap.put("ongoing", "" + totalItems.value()); - if (!test) { - manager - .sendMessage( - new Message( - parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap), - parser.get("rabbitOngoingQueue"), - true, - false); - } - mdstore.write().format("parquet").save(parser.get("output")); - reportMap.put("inputItem", "" + totalItems.value()); - reportMap.put("invalidRecords", "" + invalidRecords.value()); - reportMap.put("mdStoreSize", "" + mdStoreRecords.value()); - if (!test) { - manager - .sendMessage( - new Message(parser.get("workflowId"), "Collection", MessageType.REPORT, reportMap), - parser.get("rabbitReportQueue"), - true, - false); - manager.close(); - } - }); - - } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpClientParams.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpClientParams.java new file mode 100644 index 000000000..ab0d5cc02 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpClientParams.java @@ -0,0 +1,94 @@ + +package eu.dnetlib.dhp.collection; + +/** + * Bundles the http connection parameters driving the client behaviour. 
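+ * <p>
+ * Defaults (see the constants below): 3 retries, 0 ms delay between requests, 10 s retry delay,
+ * 10 s connect timeout, 30 s read timeout. Typical usage with the connector defined further down
+ * in this patch (illustrative sketch only):
+ * <pre>
+ * HttpClientParams params = new HttpClientParams();
+ * params.setMaxNumberOfRetry(5);
+ * String xml = new HttpConnector2(params).getInputSource("https://example.org/oai?verb=Identify", new AggregatorReport());
+ * </pre>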
+ */ +public class HttpClientParams { + + // Defaults + public static int _maxNumberOfRetry = 3; + public static int _requestDelay = 0; // milliseconds + public static int _retryDelay = 10; // seconds + public static int _connectTimeOut = 10; // seconds + public static int _readTimeOut = 30; // seconds + + /** + * Maximum number of allowed retries before failing + */ + private int maxNumberOfRetry; + + /** + * Delay between consecutive requests (milliseconds) + */ + private int requestDelay; + + /** + * Time to wait after a failure before retrying (seconds) + */ + private int retryDelay; + + /** + * Connect timeout (seconds) + */ + private int connectTimeOut; + + /** + * Read timeout (seconds) + */ + private int readTimeOut; + + public HttpClientParams() { + this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut); + } + + public HttpClientParams(int maxNumberOfRetry, int requestDelay, int retryDelay, int connectTimeOut, + int readTimeOut) { + this.maxNumberOfRetry = maxNumberOfRetry; + this.requestDelay = requestDelay; + this.retryDelay = retryDelay; + this.connectTimeOut = connectTimeOut; + this.readTimeOut = readTimeOut; + } + + public int getMaxNumberOfRetry() { + return maxNumberOfRetry; + } + + public void setMaxNumberOfRetry(int maxNumberOfRetry) { + this.maxNumberOfRetry = maxNumberOfRetry; + } + + public int getRequestDelay() { + return requestDelay; + } + + public void setRequestDelay(int requestDelay) { + this.requestDelay = requestDelay; + } + + public int getRetryDelay() { + return retryDelay; + } + + public void setRetryDelay(int retryDelay) { + this.retryDelay = retryDelay; + } + + public void setConnectTimeOut(int connectTimeOut) { + this.connectTimeOut = connectTimeOut; + } + + public int getConnectTimeOut() { + return connectTimeOut; + } + + public int getReadTimeOut() { + return readTimeOut; + } + + public void setReadTimeOut(int readTimeOut) { + this.readTimeOut = readTimeOut; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpConnector2.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpConnector2.java new file mode 100644 index 000000000..9d8b8d34b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpConnector2.java @@ -0,0 +1,259 @@ + +package eu.dnetlib.dhp.collection; + +import static eu.dnetlib.dhp.utils.DHPUtils.*; + +import java.io.IOException; +import java.io.InputStream; +import java.net.*; +import java.util.List; +import java.util.Map; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.math.NumberUtils; +import org.apache.http.HttpHeaders; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; + +/** + * Migrated from https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HttpConnector.java + * + * @author jochen, michele, andrea, alessia, claudio + */ +public class HttpConnector2 { + + private static final Logger log = LoggerFactory.getLogger(HttpConnector2.class); + + private static final String REPORT_PREFIX = "http:"; + + private HttpClientParams clientParams; + + private String responseType = null; + + private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)"; + + public HttpConnector2() { + this(new HttpClientParams()); + } + + public HttpConnector2(HttpClientParams clientParams) { + this.clientParams = clientParams;
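+ // NB: CookieHandler.setDefault below is JVM-global, not per-connector instance; ACCEPT_ALL retains
+ // session cookies that some endpoints (e.g. paginating OAI-PMH servers) expect across consecutive requests.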
+ CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL)); + } + + /** + * @see HttpConnector2#getInputSource(java.lang.String, AggregatorReport) + */ + public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorException { + return IOUtils.toInputStream(getInputSource(requestUrl)); + } + + /** + * @see HttpConnector2#getInputSource(java.lang.String, AggregatorReport) + */ + public String getInputSource(final String requestUrl) throws CollectorException { + return attemptDownloadAsString(requestUrl, 1, new AggregatorReport()); + } + + /** + * Given the URL, returns the content via HTTP GET + * + * @param requestUrl the URL + * @param report the list of errors + * @return the content of the downloaded resource + * @throws CollectorException when retrying more than maxNumberOfRetry times + */ + public String getInputSource(final String requestUrl, AggregatorReport report) + throws CollectorException { + return attemptDownloadAsString(requestUrl, 1, report); + } + + private String attemptDownloadAsString(final String requestUrl, final int retryNumber, + final AggregatorReport report) throws CollectorException { + + try (InputStream s = attemptDownload(requestUrl, retryNumber, report)) { + return IOUtils.toString(s); + } catch (IOException e) { + log.error(e.getMessage(), e); + throw new CollectorException(e); + } + } + + private InputStream attemptDownload(final String requestUrl, final int retryNumber, + final AggregatorReport report) throws CollectorException, IOException { + + if (retryNumber > getClientParams().getMaxNumberOfRetry()) { + final String msg = String + .format( + "Max number of retries (%s/%s) exceeded, failing.", + retryNumber, getClientParams().getMaxNumberOfRetry()); + log.error(msg); + throw new CollectorException(msg); + } + + log.info("Request attempt {} [{}]", retryNumber, requestUrl); + + InputStream input = null; + + try { + if (getClientParams().getRequestDelay() > 0) { + backoffAndSleep(getClientParams().getRequestDelay()); + } + final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection(); + urlConn.setInstanceFollowRedirects(false); + urlConn.setReadTimeout(getClientParams().getReadTimeOut() * 1000); + urlConn.setConnectTimeout(getClientParams().getConnectTimeOut() * 1000); + urlConn.addRequestProperty(HttpHeaders.USER_AGENT, userAgent); + + if (log.isDebugEnabled()) { + logHeaderFields(urlConn); + } + + int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); + if (is2xx(urlConn.getResponseCode())) { + input = urlConn.getInputStream(); + responseType = urlConn.getContentType(); + return input; + } + if (is3xx(urlConn.getResponseCode())) { + // REDIRECTS + final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); + log.info(String.format("The requested url has been moved to %s", newUrl)); + report + .put( + REPORT_PREFIX + urlConn.getResponseCode(), + String.format("Moved to: %s", newUrl)); + urlConn.disconnect(); + if (retryAfter > 0) { + backoffAndSleep(retryAfter * 1000); // retry-after is expressed in seconds, backoffAndSleep expects ms + } + return attemptDownload(newUrl, retryNumber + 1, report); + } + if (is4xx(urlConn.getResponseCode())) { + // CLIENT ERROR, DO NOT RETRY + report + .put( + REPORT_PREFIX + urlConn.getResponseCode(), + String + .format( + "%s error: %s", requestUrl, urlConn.getResponseMessage())); + throw new CollectorException("4xx error: request will not be repeated. " + report); + } + if (is5xx(urlConn.getResponseCode())) { + // SERVER SIDE ERRORS, RETRY ONLY on 503 + switch (urlConn.getResponseCode()) { + case HttpURLConnection.HTTP_UNAVAILABLE: + if (retryAfter > 0) { + log + .warn( + requestUrl + " - waiting and repeating request after suggested retry-after " + + retryAfter + " sec."); + backoffAndSleep(retryAfter * 1000); + } else { + log + .warn( + requestUrl + " - waiting and repeating request after default delay of " + + getClientParams().getRetryDelay() + " sec."); + backoffAndSleep(retryNumber * getClientParams().getRetryDelay() * 1000); + } + report.put(REPORT_PREFIX + urlConn.getResponseCode(), requestUrl); + urlConn.disconnect(); + return attemptDownload(requestUrl, retryNumber + 1, report); + default: + report + .put( + REPORT_PREFIX + urlConn.getResponseCode(), + String + .format( + "%s Error: %s", requestUrl, urlConn.getResponseMessage())); + throw new CollectorException(urlConn.getResponseCode() + " error " + report); + } + } + throw new CollectorException( + String + .format( + "Unexpected status code: %s errors: %s", urlConn.getResponseCode(), + MAPPER.writeValueAsString(report))); + } catch (MalformedURLException | UnknownHostException e) { + log.error(e.getMessage(), e); + report.put(e.getClass().getName(), e.getMessage()); + throw new CollectorException(e.getMessage(), e); + } catch (SocketTimeoutException | SocketException e) { + log.error(e.getMessage(), e); + report.put(e.getClass().getName(), e.getMessage()); + backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000); + return attemptDownload(requestUrl, retryNumber + 1, report); + } + } + + private void logHeaderFields(final HttpURLConnection urlConn) throws IOException { + log.debug("StatusCode: " + urlConn.getResponseMessage()); + + for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) { + if (e.getKey() != null) { + for (String v : e.getValue()) { + log.debug(" key: " + e.getKey() + " - value: " + v); + } + } + } + } + + private void backoffAndSleep(int sleepTimeMs) throws CollectorException { + log.info("I'm going to sleep for {}ms", sleepTimeMs); + try { + Thread.sleep(sleepTimeMs); + } catch (InterruptedException e) { + log.error(e.getMessage(), e); + throw new CollectorException(e); + } + } + + private int obtainRetryAfter(final Map<String, List<String>> headerMap) { + for (String key : headerMap.keySet()) { + if ((key != null) && key.equalsIgnoreCase(HttpHeaders.RETRY_AFTER) && (headerMap.get(key).size() > 0) + && NumberUtils.isCreatable(headerMap.get(key).get(0))) { + return Integer.parseInt(headerMap.get(key).get(0)) + 10; + } + } + return -1; + } + + private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorException { + for (String key : headerMap.keySet()) { + if ((key != null) && key.equalsIgnoreCase(HttpHeaders.LOCATION) && (headerMap.get(key).size() > 0)) { + return headerMap.get(key).get(0); + } + } + throw new CollectorException("The requested url has been MOVED, but 'location' param is MISSING"); + } + + private boolean is2xx(final int statusCode) { + return statusCode >= 200 && statusCode <= 299; + } + + private boolean is4xx(final int statusCode) { + return statusCode >= 400 && statusCode <= 499; + } + + private boolean is3xx(final int statusCode) { + return statusCode >= 300 && statusCode <= 399; + } + + private boolean is5xx(final int statusCode) { + return statusCode >= 500 && statusCode <= 599; + } + + public String getResponseType() { + return responseType; + } + + public HttpClientParams getClientParams() { + return clientParams; + } + + 
public void setClientParams(HttpClientParams clientParams) { + this.clientParams = clientParams; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/JsonUtils.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/JsonUtils.java new file mode 100644 index 000000000..da3768a4a --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/JsonUtils.java @@ -0,0 +1,84 @@ + +package eu.dnetlib.dhp.collection; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class JsonUtils { + + private static final Log log = LogFactory.getLog(JsonUtils.class); + + public static final String wrapName = "recordWrap"; + + /** + * Converts JSON key names into valid XML tag names: 'whitespace(s)' and '/' become '_', '(' and ')' are dropped. + * See the W3C XML syntax (https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags) for valid tag names; + * this is a work-around for the JSON to XML conversion of the org.json.XML package. + * + * known bugs: doesn't prevent mis-rewrites when quotes or colons appear inside values, e.g. "key name":" ["sexy name",": penari","erotic dance"], + * + * @param jsonInput the JSON string to sanitize + * @return the JSON string with converted key names + */ + public String syntaxConvertJsonKeyNames(String jsonInput) { + + log.trace("before convertJsonKeyNames: " + jsonInput); + // pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml) + // replace ' 's in JSON names with '_' + while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":"); + } + + // replace forward-slash (sign '/' ) in JSON names with '_' + while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":"); + } + + // replace '(' in JSON names with '' + while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":"); + } + + // replace ')' in JSON names with '' + while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":"); + } + + // prefix JSON key names whose second character is a digit with 'n_' (intended: names starting with a number) + while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":"); + } + // prefix all-digit JSON key names with 'm_' + while (jsonInput.matches(".*\"([0-9]+)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([0-9]+)\":", "\"m_$1\":"); + } + + // remove ':' between numbers (like in '2018-08-28T11:05:00Z') within JSON key names + while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":"); + } + + // replace ',' in JSON key names with '.' to prevent ',' in xml tag names.
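+ // Worked examples for the rules in this method (hypothetical inputs, rules applied in order):
+ //   "file size"  -> "file_size"   (whitespace -> '_')
+ //   "mime/type"  -> "mime_type"   ('/' -> '_')
+ //   "count(x)"   -> "countx"      ('(' and ')' dropped)
+ //   "2018"       -> "n_2018"      (second char is a digit, so the 'n_' rule fires before 'm_')
+ //   "7"          -> "m_7"         (single digit: only the all-digit 'm_' rule matches)
+ //   "a=b"        -> "a-b"         ('=' -> '-', see the rule further below)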
+ // while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) { + // jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":"); + // } + + // replace '=' in JSON Keynames with '-' + while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":"); + } + + log.trace("after syntaxConvertJsonKeyNames: " + jsonInput); + return jsonInput; + } + + public String convertToXML(final String jsonRecord) { + String resultXml = ""; + org.json.JSONObject jsonObject = new org.json.JSONObject(syntaxConvertJsonKeyNames(jsonRecord)); + resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element + log.trace("before inputStream: " + resultXml); + resultXml = XmlCleaner.cleanAllEntities(resultXml); + log.trace("after cleaning: " + resultXml); + return resultXml; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/UnknownCollectorPluginException.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/UnknownCollectorPluginException.java new file mode 100644 index 000000000..2b0a98e53 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/UnknownCollectorPluginException.java @@ -0,0 +1,32 @@ + +package eu.dnetlib.dhp.collection; + +public class UnknownCollectorPluginException extends Exception { + + /** */ + private static final long serialVersionUID = -290723075076039757L; + + public UnknownCollectorPluginException() { + super(); + } + + public UnknownCollectorPluginException( + final String message, + final Throwable cause, + final boolean enableSuppression, + final boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } + + public UnknownCollectorPluginException(final String message, final Throwable cause) { + super(message, cause); + } + + public UnknownCollectorPluginException(final String message) { + super(message); + } + + public UnknownCollectorPluginException(final Throwable cause) { + super(cause); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/XmlCleaner.java similarity index 99% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/XmlCleaner.java index 44aeb4d02..c674031f6 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/XmlCleaner.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.collection.worker.utils; +package eu.dnetlib.dhp.collection; import java.util.HashMap; import java.util.HashSet; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java index 4a0c70c45..457f63468 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java @@ -3,10 +3,21 @@ package eu.dnetlib.dhp.collection.plugin; import java.util.stream.Stream; -import eu.dnetlib.collector.worker.model.ApiDescriptor; -import eu.dnetlib.dhp.collection.worker.DnetCollectorException; 
+import eu.dnetlib.dhp.aggregation.common.AggregatorReport; +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.collection.CollectorException; public interface CollectorPlugin { - Stream collect(ApiDescriptor api) throws DnetCollectorException; + enum NAME { + oai, other, rest_json2xml; + + public enum OTHER_NAME { + mdstore_mongodb_dump, mdstore_mongodb + } + + } + + Stream collect(ApiDescriptor api, AggregatorReport report) throws CollectorException; + } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MDStoreCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MDStoreCollectorPlugin.java new file mode 100644 index 000000000..549c59720 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MDStoreCollectorPlugin.java @@ -0,0 +1,58 @@ + +package eu.dnetlib.dhp.collection.plugin.mongodb; + +import java.util.Optional; +import java.util.Spliterator; +import java.util.Spliterators; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +import org.bson.Document; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.mongodb.client.MongoCollection; + +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.collection.CollectorException; +import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; +import eu.dnetlib.dhp.common.MdstoreClient; + +public class MDStoreCollectorPlugin implements CollectorPlugin { + + private static final Logger log = LoggerFactory.getLogger(MDStoreCollectorPlugin.class); + + public static final String MONGODB_DBNAME = "mongodb_dbname"; + public static final String MDSTORE_ID = "mdstore_id"; + + @Override + public Stream collect(ApiDescriptor api, AggregatorReport report) throws CollectorException { + + final String mongoBaseUrl = Optional + .ofNullable(api.getBaseUrl()) + .orElseThrow( + () -> new CollectorException( + "missing mongodb baseUrl, expected in eu.dnetlib.dhp.collection.ApiDescriptor.baseUrl")); + log.info("mongoBaseUrl: {}", mongoBaseUrl); + + final String dbName = Optional + .ofNullable(api.getParams().get(MONGODB_DBNAME)) + .orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MONGODB_DBNAME))); + log.info("dbName: {}", dbName); + + final String mdId = Optional + .ofNullable(api.getParams().get(MDSTORE_ID)) + .orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MDSTORE_ID))); + log.info("mdId: {}", mdId); + + final MdstoreClient client = new MdstoreClient(mongoBaseUrl, dbName); + final MongoCollection mdstore = client.mdStore(mdId); + long size = mdstore.count(); + + return StreamSupport + .stream( + Spliterators.spliterator(mdstore.find().iterator(), size, Spliterator.SIZED), false) + .map(doc -> doc.getString("body")); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbDumpCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbDumpCollectorPlugin.java new file mode 100644 index 000000000..3199af5b7 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbDumpCollectorPlugin.java @@ -0,0 +1,54 @@ + +package eu.dnetlib.dhp.collection.plugin.mongodb; + +import java.io.BufferedReader; +import java.io.IOException; +import 
java.io.InputStreamReader; +import java.nio.charset.Charset; +import java.util.Optional; +import java.util.stream.Stream; +import java.util.zip.GZIPInputStream; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.collection.CollectorException; +import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; +import eu.dnetlib.dhp.utils.DHPUtils; + +public class MongoDbDumpCollectorPlugin implements CollectorPlugin { + + public static final String PATH_PARAM = "path"; + public static final String BODY_JSONPATH = "$.body"; + + public FileSystem fileSystem; + + public MongoDbDumpCollectorPlugin(FileSystem fileSystem) { + this.fileSystem = fileSystem; + } + + @Override + public Stream collect(ApiDescriptor api, AggregatorReport report) throws CollectorException { + + final Path path = Optional + .ofNullable(api.getParams().get("path")) + .map(Path::new) + .orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", PATH_PARAM))); + + try { + if (!fileSystem.exists(path)) { + throw new CollectorException("path does not exist: " + path.toString()); + } + + return new BufferedReader( + new InputStreamReader(new GZIPInputStream(fileSystem.open(path)), Charset.defaultCharset())) + .lines() + .map(s -> DHPUtils.getJPathString(BODY_JSONPATH, s)); + + } catch (IOException e) { + throw new CollectorException(e); + } + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java index 7f71f401d..4600562ca 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java @@ -13,9 +13,11 @@ import com.google.common.base.Splitter; import com.google.common.collect.Iterators; import com.google.common.collect.Lists; -import eu.dnetlib.collector.worker.model.ApiDescriptor; +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.collection.CollectorException; +import eu.dnetlib.dhp.collection.HttpClientParams; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; -import eu.dnetlib.dhp.collection.worker.DnetCollectorException; public class OaiCollectorPlugin implements CollectorPlugin { @@ -26,8 +28,15 @@ public class OaiCollectorPlugin implements CollectorPlugin { private OaiIteratorFactory oaiIteratorFactory; + private HttpClientParams clientParams; + + public OaiCollectorPlugin(HttpClientParams clientParams) { + this.clientParams = clientParams; + } + @Override - public Stream collect(final ApiDescriptor api) throws DnetCollectorException { + public Stream collect(final ApiDescriptor api, final AggregatorReport report) + throws CollectorException { final String baseUrl = api.getBaseUrl(); final String mdFormat = api.getParams().get(FORMAT_PARAM); final String setParam = api.getParams().get(OAI_SET_PARAM); @@ -46,26 +55,26 @@ public class OaiCollectorPlugin implements CollectorPlugin { } if (baseUrl == null || baseUrl.isEmpty()) { - throw new DnetCollectorException("Param 'baseurl' is null or empty"); + throw new CollectorException("Param 'baseurl' is null or empty"); } if (mdFormat == null || mdFormat.isEmpty()) { - throw new 
DnetCollectorException("Param 'mdFormat' is null or empty"); + throw new CollectorException("Param 'mdFormat' is null or empty"); } if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { - throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + fromDate); + throw new CollectorException("Invalid date (YYYY-MM-DD): " + fromDate); } if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { - throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + untilDate); + throw new CollectorException("Invalid date (YYYY-MM-DD): " + untilDate); } final Iterator> iters = sets .stream() .map( set -> getOaiIteratorFactory() - .newIterator(baseUrl, mdFormat, set, fromDate, untilDate)) + .newIterator(baseUrl, mdFormat, set, fromDate, untilDate, getClientParams(), report)) .iterator(); return StreamSupport @@ -79,4 +88,12 @@ public class OaiCollectorPlugin implements CollectorPlugin { } return oaiIteratorFactory; } + + public HttpClientParams getClientParams() { + return clientParams; + } + + public void setClientParams(HttpClientParams clientParams) { + this.clientParams = clientParams; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java index d61f13fb5..65695fe8e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java @@ -1,7 +1,9 @@ package eu.dnetlib.dhp.collection.plugin.oai; +import java.io.IOException; import java.io.StringReader; +import java.io.StringWriter; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.Iterator; @@ -9,24 +11,28 @@ import java.util.Queue; import java.util.concurrent.PriorityBlockingQueue; import org.apache.commons.lang.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.dom4j.Document; import org.dom4j.DocumentException; +import org.dom4j.DocumentHelper; import org.dom4j.Node; +import org.dom4j.io.OutputFormat; import org.dom4j.io.SAXReader; +import org.dom4j.io.XMLWriter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import eu.dnetlib.dhp.collection.worker.DnetCollectorException; -import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; -import eu.dnetlib.dhp.collection.worker.utils.XmlCleaner; +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; +import eu.dnetlib.dhp.collection.CollectorException; +import eu.dnetlib.dhp.collection.HttpConnector2; +import eu.dnetlib.dhp.collection.XmlCleaner; public class OaiIterator implements Iterator { - private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on - // 11/24/08 5:02 PM + private static final Logger log = LoggerFactory.getLogger(OaiIterator.class); + + private final static String REPORT_PREFIX = "oai:"; private final Queue queue = new PriorityBlockingQueue<>(); - private final SAXReader reader = new SAXReader(); private final String baseUrl; private final String set; @@ -35,7 +41,8 @@ public class OaiIterator implements Iterator { private final String untilDate; private String token; private boolean started; - private final HttpConnector httpConnector; + private final HttpConnector2 httpConnector; + private AggregatorReport report; public OaiIterator( final String baseUrl, @@ -43,7 +50,8 @@ public class OaiIterator 
implements Iterator { final String set, final String fromDate, final String untilDate, - final HttpConnector httpConnector) { + final HttpConnector2 httpConnector, + final AggregatorReport report) { this.baseUrl = baseUrl; this.mdFormat = mdFormat; this.set = set; @@ -51,6 +59,7 @@ public class OaiIterator implements Iterator { this.untilDate = untilDate; this.started = false; this.httpConnector = httpConnector; + this.report = report; } private void verifyStarted() { @@ -58,7 +67,7 @@ public class OaiIterator implements Iterator { this.started = true; try { this.token = firstPage(); - } catch (final DnetCollectorException e) { + } catch (final CollectorException e) { throw new RuntimeException(e); } } @@ -80,7 +89,7 @@ public class OaiIterator implements Iterator { while (queue.isEmpty() && token != null && !token.isEmpty()) { try { token = otherPages(token); - } catch (final DnetCollectorException e) { + } catch (final CollectorException e) { throw new RuntimeException(e); } } @@ -92,7 +101,7 @@ public class OaiIterator implements Iterator { public void remove() { } - private String firstPage() throws DnetCollectorException { + private String firstPage() throws CollectorException { try { String url = baseUrl + "?verb=ListRecords&metadataPrefix=" + URLEncoder.encode(mdFormat, "UTF-8"); if (set != null && !set.isEmpty()) { @@ -108,7 +117,8 @@ public class OaiIterator implements Iterator { return downloadPage(url); } catch (final UnsupportedEncodingException e) { - throw new DnetCollectorException(e); + report.put(e.getClass().getName(), e.getMessage()); + throw new CollectorException(e); } } @@ -126,32 +136,35 @@ public class OaiIterator implements Iterator { return result.trim(); } - private String otherPages(final String resumptionToken) throws DnetCollectorException { + private String otherPages(final String resumptionToken) throws CollectorException { try { return downloadPage( baseUrl + "?verb=ListRecords&resumptionToken=" + URLEncoder.encode(resumptionToken, "UTF-8")); } catch (final UnsupportedEncodingException e) { - throw new DnetCollectorException(e); + report.put(e.getClass().getName(), e.getMessage()); + throw new CollectorException(e); } } - private String downloadPage(final String url) throws DnetCollectorException { + private String downloadPage(final String url) throws CollectorException { - final String xml = httpConnector.getInputSource(url); + final String xml = httpConnector.getInputSource(url, report); Document doc; try { - doc = reader.read(new StringReader(xml)); + doc = DocumentHelper.parseText(xml); } catch (final DocumentException e) { - log.warn("Error parsing xml, I try to clean it: " + xml, e); + log.warn("Error parsing xml, I try to clean it. 
{}", e.getMessage()); + report.put(e.getClass().getName(), e.getMessage()); + final String cleaned = XmlCleaner.cleanAllEntities(xml); + try { - doc = reader.read(new StringReader(cleaned)); + doc = DocumentHelper.parseText(cleaned); + } catch (final DocumentException e1) { final String resumptionToken = extractResumptionToken(xml); if (resumptionToken == null) { - throw new DnetCollectorException("Error parsing cleaned document:" + cleaned, e1); + report.put(e1.getClass().getName(), e1.getMessage()); + throw new CollectorException("Error parsing cleaned document:\n" + cleaned, e1); } return resumptionToken; } @@ -159,19 +172,35 @@ public class OaiIterator implements Iterator<String> { final Node errorNode = doc.selectSingleNode("/*[local-name()='OAI-PMH']/*[local-name()='error']"); if (errorNode != null) { - final String code = errorNode.valueOf("@code"); - if ("noRecordsMatch".equalsIgnoreCase(code.trim())) { - log.warn("noRecordsMatch for oai call: " + url); + final String code = errorNode.valueOf("@code").trim(); + if ("noRecordsMatch".equalsIgnoreCase(code)) { + final String msg = "noRecordsMatch for oai call: " + url; + log.warn(msg); + report.put(REPORT_PREFIX + code, msg); return null; } else { - throw new DnetCollectorException(code + " - " + errorNode.getText()); + final String msg = code + " - " + errorNode.getText(); + report.put(REPORT_PREFIX + "error", msg); + throw new CollectorException(msg); } } for (final Object o : doc.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']")) { - queue.add(((Node) o).asXML()); + final StringWriter sw = new StringWriter(); + final XMLWriter writer = new XMLWriter(sw, OutputFormat.createPrettyPrint()); + try { + writer.write((Node) o); + queue.add(sw.toString()); + } catch (IOException e) { + report.put(e.getClass().getName(), e.getMessage()); + throw new CollectorException("Error parsing XML record:\n" + ((Node) o).asXML(), e); + } } return doc.valueOf("//*[local-name()='resumptionToken']"); } + + public AggregatorReport getReport() { + return report; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java index 4a6ea7f67..48f6a94c8 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java @@ -3,24 +3,28 @@ package eu.dnetlib.dhp.collection.plugin.oai; import java.util.Iterator; -import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; +import eu.dnetlib.dhp.collection.HttpClientParams; +import eu.dnetlib.dhp.collection.HttpConnector2; public class OaiIteratorFactory { - private HttpConnector httpConnector; + private HttpConnector2 httpConnector; public Iterator<String> newIterator( final String baseUrl, final String mdFormat, final String set, final String fromDate, - final String untilDate) { - return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector()); + final String untilDate, + final HttpClientParams clientParams, + final AggregatorReport report) { + return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector(clientParams), report); } - private HttpConnector getHttpConnector() { + private HttpConnector2 getHttpConnector(HttpClientParams clientParams) { if (httpConnector == null) - 
httpConnector = new HttpConnector(); + httpConnector = new HttpConnector2(clientParams); return httpConnector; } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java new file mode 100644 index 000000000..e59db143a --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java @@ -0,0 +1,105 @@ + +package eu.dnetlib.dhp.collection.plugin.rest; + +import java.util.Optional; +import java.util.Spliterator; +import java.util.Spliterators; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +import org.apache.commons.lang3.StringUtils; + +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.collection.CollectorException; +import eu.dnetlib.dhp.collection.HttpClientParams; +import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; + +/** + * TODO: delegate HTTP requests to the common HttpConnector2 implementation. + * + * @author js, Andreas Czerniak + * @date 2020-04-09 + * + */ +public class RestCollectorPlugin implements CollectorPlugin { + + public static final String RESULT_SIZE_VALUE_DEFAULT = "100"; + + private HttpClientParams clientParams; + + public RestCollectorPlugin(HttpClientParams clientParams) { + this.clientParams = clientParams; + } + + @Override + public Stream collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException { + final String baseUrl = api.getBaseUrl(); + + final String resumptionType = api.getParams().get("resumptionType"); + final String resumptionParam = api.getParams().get("resumptionParam"); + final String resumptionXpath = api.getParams().get("resumptionXpath"); + final String resultTotalXpath = api.getParams().get("resultTotalXpath"); + final String resultFormatParam = api.getParams().get("resultFormatParam"); + final String resultFormatValue = api.getParams().get("resultFormatValue"); + final String resultSizeParam = api.getParams().get("resultSizeParam"); + final String queryParams = api.getParams().get("queryParams"); + final String entityXpath = api.getParams().get("entityXpath"); + final String authMethod = api.getParams().get("authMethod"); + final String authToken = api.getParams().get("authToken"); + final String resultSizeValue = Optional + .ofNullable(api.getParams().get("resultSizeValue")) + .filter(StringUtils::isNotBlank) + .orElse(RESULT_SIZE_VALUE_DEFAULT); + + if (StringUtils.isBlank(baseUrl)) { + throw new CollectorException("Param 'baseUrl' is null or empty"); + } + if (StringUtils.isBlank(resumptionType)) { + throw new CollectorException("Param 'resumptionType' is null or empty"); + } + if (StringUtils.isBlank(resumptionParam)) { + throw new CollectorException("Param 'resumptionParam' is null or empty"); + } + if (StringUtils.isBlank(resultFormatValue)) { + throw new CollectorException("Param 'resultFormatValue' is null or empty"); + } + if (StringUtils.isBlank(queryParams)) { + throw new CollectorException("Param 'queryParams' is null or empty"); + } + if (StringUtils.isBlank(entityXpath)) { + throw new CollectorException("Param 'entityXpath' is null or empty"); + } + + final String resultOutputFormat = Optional + .ofNullable(api.getParams().get("resultOutputFormat")) + .map(String::toLowerCase) + .filter(StringUtils::isNotBlank) + .orElse(resultFormatValue.toLowerCase()); + 
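+ // Illustrative ApiDescriptor params for this plugin (hypothetical values, not taken from a real datasource):
+ //   resumptionType=count, resumptionParam=page, resultFormatParam=format, resultFormatValue=json,
+ //   resultSizeParam=size, resultSizeValue=100, queryParams=q=*, entityXpath=//record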
+ RestIterator it = new RestIterator( + getClientParams(), + baseUrl, + resumptionType, + resumptionParam, + resumptionXpath, + resultTotalXpath, + resultFormatParam, + resultFormatValue, + resultSizeParam, + resultSizeValue, + queryParams, + entityXpath, + authMethod, + authToken, + resultOutputFormat); + + return StreamSupport + .stream( + Spliterators.spliteratorUnknownSize(it, Spliterator.ORDERED), false); + } + + public HttpClientParams getClientParams() { + return clientParams; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java new file mode 100644 index 000000000..6f606b480 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java @@ -0,0 +1,412 @@ + +package eu.dnetlib.dhp.collection.plugin.rest; + +import java.io.InputStream; +import java.io.StringWriter; +import java.io.UnsupportedEncodingException; +import java.net.HttpURLConnection; +import java.net.URL; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.util.Iterator; +import java.util.Queue; +import java.util.concurrent.PriorityBlockingQueue; + +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import javax.xml.xpath.*; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.http.HttpHeaders; +import org.apache.http.entity.ContentType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.InputSource; + +import eu.dnetlib.dhp.collection.CollectorException; +import eu.dnetlib.dhp.collection.HttpClientParams; +import eu.dnetlib.dhp.collection.JsonUtils; + +/** + * log.info(...) equal to log.trace(...) in the application-logs + *

+ known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue + * + * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak + * @date 2020-04-09 + * + */ +public class RestIterator implements Iterator<String> { + + private static final Logger log = LoggerFactory.getLogger(RestIterator.class); + public static final String UTF_8 = "UTF-8"; + + private HttpClientParams clientParams; + + private final String BASIC = "basic"; + + private JsonUtils jsonUtils; + + private String baseUrl; + private String resumptionType; + private String resumptionParam; + private String resultFormatValue; + private String queryParams; + private int resultSizeValue; + private int resumptionInt = 0; // integer resumption token (first record to harvest) + private int resultTotal = -1; + private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest + // or token scanned from results) + private InputStream resultStream; + private Transformer transformer; + private XPath xpath; + private String query; + private XPathExpression xprResultTotalPath; + private XPathExpression xprResumptionPath; + private XPathExpression xprEntity; + private String queryFormat; + private String querySize; + private String authMethod; + private String authToken; + private Queue<String> recordQueue = new PriorityBlockingQueue<>(); + private int discoverResultSize = 0; + private int pagination = 1; + /* + * While resultFormatValue is added to the request parameter, this is used to say that the results are retrieved in + * json. useful for cases when the target API expects a resultFormatValue != json, but the results are returned in + * json. An example is the EU Open Data Portal API: resultFormatValue=standard, results are in json format. + */ + private String resultOutputFormat; + + /** RestIterator class + * compatible with version 1.3.33 + */ + public RestIterator( + final HttpClientParams clientParams, + final String baseUrl, + final String resumptionType, + final String resumptionParam, + final String resumptionXpath, + final String resultTotalXpath, + final String resultFormatParam, + final String resultFormatValue, + final String resultSizeParam, + final String resultSizeValueStr, + final String queryParams, + final String entityXpath, + final String authMethod, + final String authToken, + final String resultOutputFormat) { + + this.clientParams = clientParams; + this.jsonUtils = new JsonUtils(); + this.baseUrl = baseUrl; + this.resumptionType = resumptionType; + this.resumptionParam = resumptionParam; + this.resultFormatValue = resultFormatValue; + this.resultSizeValue = Integer.valueOf(resultSizeValueStr); + this.queryParams = queryParams; + this.authMethod = authMethod; + this.authToken = authToken; + this.resultOutputFormat = resultOutputFormat; + + queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue + : ""; + querySize = StringUtils.isNotBlank(resultSizeParam) ?
"&" + resultSizeParam + "=" + resultSizeValueStr : ""; + + try { + initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath); + } catch (Exception e) { + throw new IllegalStateException("xml transformation init failed: " + e.getMessage()); + } + initQueue(); + } + + private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath) + throws TransformerConfigurationException, XPathExpressionException { + transformer = TransformerFactory.newInstance().newTransformer(); + transformer.setOutputProperty(OutputKeys.INDENT, "yes"); + transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3"); + xpath = XPathFactory.newInstance().newXPath(); + xprResultTotalPath = xpath.compile(resultTotalXpath); + xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath); + xprEntity = xpath.compile(entityXpath); + } + + private void initQueue() { + query = baseUrl + "?" + queryParams + querySize + queryFormat; + log.info("REST calls starting with " + query); + } + + private void disconnect() { + // TODO close inputstream + } + + /* + * (non-Javadoc) + * @see java.util.Iterator#hasNext() + */ + @Override + public boolean hasNext() { + if (recordQueue.isEmpty() && query.isEmpty()) { + disconnect(); + return false; + } else { + return true; + } + } + + /* + * (non-Javadoc) + * @see java.util.Iterator#next() + */ + @Override + public String next() { + synchronized (recordQueue) { + while (recordQueue.isEmpty() && !query.isEmpty()) { + try { + query = downloadPage(query); + } catch (CollectorException e) { + log.debug("CollectorPlugin.next()-Exception: " + e); + throw new RuntimeException(e); + } + } + return recordQueue.poll(); + } + } + + /* + * download page and return nextQuery + */ + private String downloadPage(String query) throws CollectorException { + String resultJson; + String resultXml = ""; + String nextQuery = ""; + String emptyXml = resultXml + "<" + JsonUtils.wrapName + ">"; + Node resultNode = null; + NodeList nodeList = null; + String qUrlArgument = ""; + int urlOldResumptionSize = 0; + InputStream theHttpInputStream; + + // check if cursor=* is initial set otherwise add it to the queryParam URL + if (resumptionType.equalsIgnoreCase("deep-cursor")) { + log.debug("check resumptionType deep-cursor and check cursor=*?" 
+ query); + if (!query.contains("&cursor=")) { + query += "&cursor=*"; + } + } + + try { + log.info("requesting URL [{}]", query); + + URL qUrl = new URL(query); + log.debug("authMethod: " + authMethod); + if ("bearer".equalsIgnoreCase(this.authMethod)) { + log.trace("authMethod before inputStream: " + resultXml); + HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); + conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + authToken); + conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType()); + conn.setRequestMethod("GET"); + theHttpInputStream = conn.getInputStream(); + } else if (BASIC.equalsIgnoreCase(this.authMethod)) { + log.trace("authMethod before inputStream: " + resultXml); + HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); + conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + authToken); + conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType()); + conn.setRequestMethod("GET"); + theHttpInputStream = conn.getInputStream(); + } else { + theHttpInputStream = qUrl.openStream(); + } + + resultStream = theHttpInputStream; + if ("json".equals(resultOutputFormat)) { + resultJson = IOUtils.toString(resultStream, UTF_8); + resultXml = jsonUtils.convertToXML(resultJson); + resultStream = IOUtils.toInputStream(resultXml, UTF_8); + } + + if (!(emptyXml).equalsIgnoreCase(resultXml)) { + resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE); + nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET); + log.debug("nodeList.length: " + nodeList.getLength()); + for (int i = 0; i < nodeList.getLength(); i++) { + StringWriter sw = new StringWriter(); + transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw)); + String toEnqueue = sw.toString(); + if (StringUtils.isBlank(toEnqueue) || emptyXml.equalsIgnoreCase(toEnqueue)) { + log.warn("The following record resulted in empty item for the feeding queue: " + resultXml); + } else { + recordQueue.add(toEnqueue); + } + } + } else { + log.warn("resultXml is equal to emptyXml"); + } + + resumptionInt += resultSizeValue; + + switch (resumptionType.toLowerCase()) { + case "scan": // read the resumptionToken, evaluate next results, e.g.
OAI, iterate over items + resumptionStr = xprResumptionPath.evaluate(resultNode); + break; + + case "count": // begin at one step for all records, iterate over items + resumptionStr = Integer.toString(resumptionInt); + break; + + case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808) + if (resultSizeValue < 2) { + throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2"); + } + qUrlArgument = qUrl.getQuery(); + String[] arrayQUrlArgument = qUrlArgument.split("&"); + for (String arrayUrlArgStr : arrayQUrlArgument) { + if (arrayUrlArgStr.startsWith(resumptionParam)) { + String[] resumptionKeyValue = arrayUrlArgStr.split("="); + if (isInteger(resumptionKeyValue[1])) { + urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]); + log.debug("discover OldResumptionSize from Url (int): " + urlOldResumptionSize); + } else { + log.debug("discover OldResumptionSize from Url (str): " + resumptionKeyValue[1]); + } + } + } + + if (((emptyXml).equalsIgnoreCase(resultXml)) + || ((nodeList != null) && (nodeList.getLength() < resultSizeValue))) { + // resumptionStr = ""; + if (nodeList != null) { + discoverResultSize += nodeList.getLength(); + } + resultTotal = discoverResultSize; + } else { + resumptionStr = Integer.toString(resumptionInt); + resultTotal = resumptionInt + 1; + if (nodeList != null) { + discoverResultSize += nodeList.getLength(); + } + } + log.info("discoverResultSize: {}", discoverResultSize); + break; + + case "pagination": + case "page": // pagination, iterate over page numbers + pagination += 1; + if (nodeList != null) { + discoverResultSize += nodeList.getLength(); + } else { + resultTotal = discoverResultSize; + pagination = discoverResultSize; + } + resumptionInt = pagination; + resumptionStr = Integer.toString(resumptionInt); + break; + + case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor in + // solr) + // isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode: + // deep-cursor, Param 'resultSizeValue' is less than 2");} + + resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode)); + queryParams = queryParams.replace("&cursor=*", ""); + + // terminating if length of nodeList is 0 + if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) { + resumptionInt += (nodeList.getLength() + 1 - resultSizeValue); + } else { + resumptionInt += (nodeList.getLength() - resultSizeValue); // subtract the resultSizeValue + // because the iteration is over + // real length and the + // resultSizeValue is added before + // the switch() + } + + discoverResultSize = nodeList.getLength(); + + log + .debug( + "downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams=" + + queryParams + " resumptionLengthIncreased: " + resumptionInt); + + break; + + default: // otherwise: abort + // resultTotal = resumptionInt; + break; + } + + } catch (Exception e) { + log.error(e.getMessage(), e); + throw new IllegalStateException("collection failed: " + e.getMessage()); + } + + try { + if (resultTotal == -1) { + resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode)); + if (resumptionType.equalsIgnoreCase("page") && !BASIC.equalsIgnoreCase(authMethod)) { + resultTotal += 1; + } // to correct the upper bound + log.info("resultTotal was -1 is now: " + resultTotal); + } + } catch (Exception e) { + log.error(e.getMessage(), e); + throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + 
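In discover mode the service never advertises a total, so the code above stops as soon as a page arrives with fewer items than resultSizeValue. The termination rule, restated in isolation under that assumption:

    public final class DiscoverMode {
        /** another page can exist only while full pages keep arriving */
        public static boolean hasMorePages(final int pageSize, final int recordsInPage) {
            return recordsInPage >= pageSize; // a short or empty page ends the iteration
        }
    }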
e.getMessage()); + } + log.debug("resultTotal: " + resultTotal); + log.debug("resInt: " + resumptionInt); + if (resumptionInt <= resultTotal) { + nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + + queryFormat; + } else { + nextQuery = ""; + // if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the + // resumptionInt and prevent a NullPointer Exception at mdStore + } + log.debug("nextQueryUrl: " + nextQuery); + return nextQuery; + + } + + private boolean isInteger(String s) { + boolean isValidInteger = false; + try { + Integer.parseInt(s); + + // s is a valid integer + + isValidInteger = true; + } catch (NumberFormatException ex) { + // s is not an integer + } + + return isValidInteger; + } + + // Method to encode a string value using `UTF-8` encoding scheme + private String encodeValue(String value) { + try { + return URLEncoder.encode(value, StandardCharsets.UTF_8.toString()); + } catch (UnsupportedEncodingException ex) { + throw new RuntimeException(ex.getCause()); + } + } + + public String getResultFormatValue() { + return resultFormatValue; + } + + public String getResultOutputFormat() { + return resultOutputFormat; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java deleted file mode 100644 index e686ad518..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java +++ /dev/null @@ -1,139 +0,0 @@ - -package eu.dnetlib.dhp.collection.worker; - -import java.io.IOException; -import java.net.URI; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.atomic.AtomicInteger; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.collector.worker.model.ApiDescriptor; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; -import eu.dnetlib.message.Message; -import eu.dnetlib.message.MessageManager; -import eu.dnetlib.message.MessageType; - -public class DnetCollectorWorker { - - private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorker.class); - - private final CollectorPluginFactory collectorPluginFactory; - - private final ArgumentApplicationParser argumentParser; - - private final MessageManager manager; - - public DnetCollectorWorker( - final CollectorPluginFactory collectorPluginFactory, - final ArgumentApplicationParser argumentParser, - final MessageManager manager) - throws DnetCollectorException { - this.collectorPluginFactory = collectorPluginFactory; - this.argumentParser = argumentParser; - this.manager = manager; - } - - public void collect() throws DnetCollectorException { - try { - final ObjectMapper jsonMapper = new ObjectMapper(); - final ApiDescriptor api = jsonMapper.readValue(argumentParser.get("apidescriptor"), ApiDescriptor.class); - - final CollectorPlugin plugin = collectorPluginFactory.getPluginByProtocol(api.getProtocol()); - - final String hdfsuri = 
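A note on encodeValue() above: URLEncoder performs application/x-www-form-urlencoded encoding, which is what a resumption token needs before it is appended to the next query string. For example:

    import java.io.UnsupportedEncodingException;
    import java.net.URLEncoder;
    import java.nio.charset.StandardCharsets;

    public class EncodeExample {
        public static void main(String[] args) throws UnsupportedEncodingException {
            // a Solr-style cursor mark typically ends with '=' padding that must not survive verbatim
            final String cursor = "AoEpVkRCRFdOOTc=";
            System.out.println(URLEncoder.encode(cursor, StandardCharsets.UTF_8.toString()));
            // prints: AoEpVkRCRFdOOTc%3D
        }
    }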
argumentParser.get("namenode"); - - // ====== Init HDFS File System Object - Configuration conf = new Configuration(); - // Set FileSystem URI - conf.set("fs.defaultFS", hdfsuri); - // Because of Maven - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - - System.setProperty("HADOOP_USER_NAME", argumentParser.get("userHDFS")); - System.setProperty("hadoop.home.dir", "/"); - // Get the filesystem - HDFS - FileSystem.get(URI.create(hdfsuri), conf); - Path hdfswritepath = new Path(argumentParser.get("hdfsPath")); - - log.info("Created path " + hdfswritepath.toString()); - - final Map ongoingMap = new HashMap<>(); - final Map reportMap = new HashMap<>(); - final AtomicInteger counter = new AtomicInteger(0); - try (SequenceFile.Writer writer = SequenceFile - .createWriter( - conf, - SequenceFile.Writer.file(hdfswritepath), - SequenceFile.Writer.keyClass(IntWritable.class), - SequenceFile.Writer.valueClass(Text.class))) { - final IntWritable key = new IntWritable(counter.get()); - final Text value = new Text(); - plugin - .collect(api) - .forEach( - content -> { - key.set(counter.getAndIncrement()); - value.set(content); - if (counter.get() % 10 == 0) { - try { - ongoingMap.put("ongoing", "" + counter.get()); - log - .debug( - "Sending message: " - + manager - .sendMessage( - new Message( - argumentParser.get("workflowId"), - "Collection", - MessageType.ONGOING, - ongoingMap), - argumentParser.get("rabbitOngoingQueue"), - true, - false)); - } catch (Exception e) { - log.error("Error on sending message ", e); - } - } - try { - writer.append(key, value); - } catch (IOException e) { - throw new RuntimeException(e); - } - }); - } - ongoingMap.put("ongoing", "" + counter.get()); - manager - .sendMessage( - new Message( - argumentParser.get("workflowId"), "Collection", MessageType.ONGOING, ongoingMap), - argumentParser.get("rabbitOngoingQueue"), - true, - false); - reportMap.put("collected", "" + counter.get()); - manager - .sendMessage( - new Message( - argumentParser.get("workflowId"), "Collection", MessageType.REPORT, reportMap), - argumentParser.get("rabbitOngoingQueue"), - true, - false); - manager.close(); - } catch (Throwable e) { - throw new DnetCollectorException("Error on collecting ", e); - } - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java deleted file mode 100644 index da30e8793..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java +++ /dev/null @@ -1,49 +0,0 @@ - -package eu.dnetlib.dhp.collection.worker; - -import org.apache.commons.io.IOUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; -import eu.dnetlib.message.MessageManager; - -/** - * DnetCollectortWorkerApplication is the main class responsible to start the Dnet Collection into HDFS. 
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java deleted file mode 100644 index da30e8793..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java +++ /dev/null @@ -1,49 +0,0 @@ - -package eu.dnetlib.dhp.collection.worker; - -import org.apache.commons.io.IOUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; -import eu.dnetlib.message.MessageManager; - -/** - * DnetCollectorWorkerApplication is the main class responsible for starting the Dnet collection into HDFS. This module - * is executed on the hadoop cluster, and it takes as input some parameters that tell it which collector - * plugin to use and where in HDFS to store the collected data - * - * @author Sandro La Bruzzo - */ -public class DnetCollectorWorkerApplication { - - private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorkerApplication.class); - - private static final CollectorPluginFactory collectorPluginFactory = new CollectorPluginFactory(); - - private static ArgumentApplicationParser argumentParser; - - /** @param args */ - public static void main(final String[] args) throws Exception { - - argumentParser = new ArgumentApplicationParser( - IOUtils - .toString( - DnetCollectorWorker.class - .getResourceAsStream( - "/eu/dnetlib/collector/worker/collector_parameter.json"))); - argumentParser.parseArgument(args); - log.info("hdfsPath = " + argumentParser.get("hdfsPath")); - log.info("json = " + argumentParser.get("apidescriptor")); - final MessageManager manager = new MessageManager( - argumentParser.get("rabbitHost"), - argumentParser.get("rabbitUser"), - argumentParser.get("rabbitPassword"), - false, - false, - null); - final DnetCollectorWorker worker = new DnetCollectorWorker(collectorPluginFactory, argumentParser, manager); - worker.collect(); - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java deleted file mode 100644 index dcaf0ea56..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java +++ /dev/null @@ -1,19 +0,0 @@ - -package eu.dnetlib.dhp.collection.worker.utils; - -import java.util.LinkedList; - -public class CollectorPluginErrorLogList extends LinkedList<String> { - - private static final long serialVersionUID = -6925786561303289704L; - - @Override - public String toString() { - String log = ""; - int index = 0; - for (final String errorMessage : this) { - log += String.format("Retry #%s: %s / ", index++, errorMessage); - } - return log; - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java deleted file mode 100644 index 7a0028e79..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java +++ /dev/null @@ -1,20 +0,0 @@ - -package eu.dnetlib.dhp.collection.worker.utils; - -import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; -import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin; -import eu.dnetlib.dhp.collection.worker.DnetCollectorException; - -public class CollectorPluginFactory { - - public CollectorPlugin getPluginByProtocol(final String protocol) throws DnetCollectorException { - if (protocol == null) - throw new DnetCollectorException("protocol cannot be null"); - switch (protocol.toLowerCase().trim()) { - case "oai": - return new OaiCollectorPlugin(); - default: - throw new DnetCollectorException("Unknown protocol"); - } - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java deleted file mode 100644 index 5d6108fad..000000000 ---
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java +++ /dev/null @@ -1,244 +0,0 @@ - -package eu.dnetlib.dhp.collection.worker.utils; - -import java.io.IOException; -import java.io.InputStream; -import java.net.*; -import java.security.GeneralSecurityException; -import java.security.cert.X509Certificate; -import java.util.List; -import java.util.Map; - -import javax.net.ssl.HttpsURLConnection; -import javax.net.ssl.SSLContext; -import javax.net.ssl.TrustManager; -import javax.net.ssl.X509TrustManager; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang.math.NumberUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import eu.dnetlib.dhp.collection.worker.DnetCollectorException; - -public class HttpConnector { - - private static final Log log = LogFactory.getLog(HttpConnector.class); - - private int maxNumberOfRetry = 6; - private int defaultDelay = 120; // seconds - private int readTimeOut = 120; // seconds - - private String responseType = null; - - private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)"; - - public HttpConnector() { - CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL)); - } - - /** - * Given the URL returns the content via HTTP GET - * - * @param requestUrl the URL - * @return the content of the downloaded resource - * @throws DnetCollectorException when retrying more than maxNumberOfRetry times - */ - public String getInputSource(final String requestUrl) throws DnetCollectorException { - return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList()); - } - - /** - * Given the URL returns the content as a stream via HTTP GET - * - * @param requestUrl the URL - * @return the content of the downloaded resource as InputStream - * @throws DnetCollectorException when retrying more than maxNumberOfRetry times - */ - public InputStream getInputSourceAsStream(final String requestUrl) throws DnetCollectorException { - return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); - } - - private String attemptDownlaodAsString( - final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) - throws DnetCollectorException { - try { - final InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); - try { - return IOUtils.toString(s); - } catch (final IOException e) { - log.error("error while retrieving from http-connection occured: " + requestUrl, e); - Thread.sleep(defaultDelay * 1000); - errorList.add(e.getMessage()); - return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList); - } finally { - IOUtils.closeQuietly(s); - } - } catch (final InterruptedException e) { - throw new DnetCollectorException(e); - } - } - - private InputStream attemptDownload( - final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) - throws DnetCollectorException { - - if (retryNumber > maxNumberOfRetry) { - throw new DnetCollectorException("Max number of retries exceeded. 
Cause: \n " + errorList); - } - - log.debug("Downloading " + requestUrl + " - try: " + retryNumber); - try { - InputStream input = null; - - try { - final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection(); - urlConn.setInstanceFollowRedirects(false); - urlConn.setReadTimeout(readTimeOut * 1000); - urlConn.addRequestProperty("User-Agent", userAgent); - - if (log.isDebugEnabled()) { - logHeaderFields(urlConn); - } - - final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); - if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) { - log.warn("waiting and repeating request after " + retryAfter + " sec."); - Thread.sleep(retryAfter * 1000); - errorList.add("503 Service Unavailable"); - urlConn.disconnect(); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } else if (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM - || urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP) { - final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); - log.debug("The requested url has been moved to " + newUrl); - errorList - .add( - String - .format( - "%s %s. Moved to: %s", - urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl)); - urlConn.disconnect(); - return attemptDownload(newUrl, retryNumber + 1, errorList); - } else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) { - log - .error( - String - .format( - "HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); - Thread.sleep(defaultDelay * 1000); - errorList - .add( - String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); - urlConn.disconnect(); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } else { - input = urlConn.getInputStream(); - responseType = urlConn.getContentType(); - return input; - } - } catch (final IOException e) { - log.error("error while retrieving from http-connection occured: " + requestUrl, e); - Thread.sleep(defaultDelay * 1000); - errorList.add(e.getMessage()); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } - } catch (final InterruptedException e) { - throw new DnetCollectorException(e); - } - } - - private void logHeaderFields(final HttpURLConnection urlConn) throws IOException { - log.debug("StatusCode: " + urlConn.getResponseMessage()); - - for (final Map.Entry> e : urlConn.getHeaderFields().entrySet()) { - if (e.getKey() != null) { - for (final String v : e.getValue()) { - log.debug(" key: " + e.getKey() + " - value: " + v); - } - } - } - } - - private int obtainRetryAfter(final Map> headerMap) { - for (final String key : headerMap.keySet()) { - if (key != null - && key.toLowerCase().equals("retry-after") - && headerMap.get(key).size() > 0 - && NumberUtils.isNumber(headerMap.get(key).get(0))) { - return Integer.parseInt(headerMap.get(key).get(0)) + 10; - } - } - return -1; - } - - private String obtainNewLocation(final Map> headerMap) - throws DnetCollectorException { - for (final String key : headerMap.keySet()) { - if (key != null && key.toLowerCase().equals("location") && headerMap.get(key).size() > 0) { - return headerMap.get(key).get(0); - } - } - throw new DnetCollectorException( - "The requested url has been MOVED, but 'location' param is MISSING"); - } - - /** - * register for https scheme; this is a workaround and not intended for the use in trusted environments - */ - public void initTrustManager() { - final X509TrustManager tm = new X509TrustManager() { - - @Override - 
public void checkClientTrusted(final X509Certificate[] xcs, final String string) { - } - - @Override - public void checkServerTrusted(final X509Certificate[] xcs, final String string) { - } - - @Override - public X509Certificate[] getAcceptedIssuers() { - return null; - } - }; - try { - final SSLContext ctx = SSLContext.getInstance("TLS"); - ctx.init(null, new TrustManager[] { - tm - }, null); - HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory()); - } catch (final GeneralSecurityException e) { - log.fatal(e); - throw new IllegalStateException(e); - } - } - - public int getMaxNumberOfRetry() { - return maxNumberOfRetry; - } - - public void setMaxNumberOfRetry(final int maxNumberOfRetry) { - this.maxNumberOfRetry = maxNumberOfRetry; - } - - public int getDefaultDelay() { - return defaultDelay; - } - - public void setDefaultDelay(final int defaultDelay) { - this.defaultDelay = defaultDelay; - } - - public int getReadTimeOut() { - return readTimeOut; - } - - public void setReadTimeOut(final int readTimeOut) { - this.readTimeOut = readTimeOut; - } - - public String getResponseType() { - return responseType; - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/DnetTransformationException.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/DnetTransformationException.java new file mode 100644 index 000000000..45bd844e2 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/DnetTransformationException.java @@ -0,0 +1,29 @@ + +package eu.dnetlib.dhp.transformation; + +public class DnetTransformationException extends Exception { + + public DnetTransformationException() { + super(); + } + + public DnetTransformationException( + final String message, + final Throwable cause, + final boolean enableSuppression, + final boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } + + public DnetTransformationException(final String message, final Throwable cause) { + super(message, cause); + } + + public DnetTransformationException(final String message) { + super(message); + } + + public DnetTransformationException(final Throwable cause) { + super(cause); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java index 8737d36ef..6a0938708 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java @@ -1,15 +1,14 @@ package eu.dnetlib.dhp.transformation; +import static eu.dnetlib.dhp.common.Constants.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.utils.DHPUtils.*; -import java.io.ByteArrayInputStream; -import java.util.HashMap; +import java.io.IOException; import java.util.Map; -import java.util.Objects; import java.util.Optional; -import org.apache.commons.cli.*; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.sql.Dataset; @@ -17,22 +16,18 @@ import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import org.apache.spark.util.LongAccumulator; -import org.dom4j.Document; -import org.dom4j.DocumentException; -import org.dom4j.Node; -import 
org.dom4j.io.SAXReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; +import eu.dnetlib.dhp.aggregation.common.AggregationCounter; +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; -import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper; -import eu.dnetlib.dhp.utils.DHPUtils; -import eu.dnetlib.message.Message; -import eu.dnetlib.message.MessageManager; -import eu.dnetlib.message.MessageType; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.message.MessageSender; +import eu.dnetlib.dhp.schema.mdstore.MetadataRecord; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; public class TransformSparkJobNode { @@ -55,67 +50,85 @@ public class TransformSparkJobNode { .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String inputPath = parser.get("input"); - final String outputPath = parser.get("output"); - final String workflowId = parser.get("workflowId"); - final String trasformationRule = extractXSLTFromTR( - Objects.requireNonNull(DHPUtils.decompressString(parser.get("transformationRule")))); + final String mdstoreInputVersion = parser.get("mdstoreInputVersion"); + final String mdstoreOutputVersion = parser.get("mdstoreOutputVersion"); - final String rabbitUser = parser.get("rabbitUser"); - final String rabbitPassword = parser.get("rabbitPassword"); - final String rabbitHost = parser.get("rabbitHost"); - final String rabbitReportQueue = parser.get("rabbitReportQueue"); - final long dateOfCollection = new Long(parser.get("dateOfCollection")); - final boolean test = parser.get("isTest") == null ? 
false : Boolean.valueOf(parser.get("isTest")); + final MDStoreVersion nativeMdStoreVersion = MAPPER.readValue(mdstoreInputVersion, MDStoreVersion.class); + final String inputPath = nativeMdStoreVersion.getHdfsPath() + MDSTORE_DATA_PATH; + log.info("inputPath: {}", inputPath); + + final MDStoreVersion cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, MDStoreVersion.class); + final String outputBasePath = cleanedMdStoreVersion.getHdfsPath(); + log.info("outputBasePath: {}", outputBasePath); + + final String isLookupUrl = parser.get("isLookupUrl"); + log.info(String.format("isLookupUrl: %s", isLookupUrl)); + + final String dateOfTransformation = parser.get("dateOfTransformation"); + log.info(String.format("dateOfTransformation: %s", dateOfTransformation)); + + final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl); + + final VocabularyGroup vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService); + + log.info("Retrieved {} vocabularies", vocabularies.vocabularyNames().size()); SparkConf conf = new SparkConf(); runWithSparkSession( conf, isSparkSessionManaged, spark -> { - final Encoder encoder = Encoders.bean(MetadataRecord.class); - final Dataset mdstoreInput = spark.read().format("parquet").load(inputPath).as(encoder); - final LongAccumulator totalItems = spark.sparkContext().longAccumulator("TotalItems"); - final LongAccumulator errorItems = spark.sparkContext().longAccumulator("errorItems"); - final LongAccumulator transformedItems = spark.sparkContext().longAccumulator("transformedItems"); - final Map vocabularies = new HashMap<>(); - vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages")); - final TransformFunction transformFunction = new TransformFunction( - totalItems, - errorItems, - transformedItems, - trasformationRule, - dateOfCollection, - vocabularies); - mdstoreInput.map(transformFunction, encoder).write().format("parquet").save(outputPath); - if (rabbitHost != null) { - System.out.println("SEND FINAL REPORT"); - final Map reportMap = new HashMap<>(); - reportMap.put("inputItem", "" + totalItems.value()); - reportMap.put("invalidRecords", "" + errorItems.value()); - reportMap.put("mdStoreSize", "" + transformedItems.value()); - System.out.println(new Message(workflowId, "Transform", MessageType.REPORT, reportMap)); - if (!test) { - final MessageManager manager = new MessageManager(rabbitHost, rabbitUser, rabbitPassword, false, - false, - null); - manager - .sendMessage( - new Message(workflowId, "Transform", MessageType.REPORT, reportMap), - rabbitReportQueue, - true, - false); - manager.close(); - } - } + transformRecords( + parser.getObjectMap(), isLookupService, spark, inputPath, outputBasePath); }); - } - private static String extractXSLTFromTR(final String tr) throws DocumentException { - SAXReader reader = new SAXReader(); - Document document = reader.read(new ByteArrayInputStream(tr.getBytes())); - Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']"); - return node.asXML(); + public static void transformRecords(final Map args, final ISLookUpService isLookUpService, + final SparkSession spark, final String inputPath, final String outputBasePath) + throws DnetTransformationException, IOException { + + final LongAccumulator totalItems = spark.sparkContext().longAccumulator(CONTENT_TOTALITEMS); + final LongAccumulator errorItems = spark.sparkContext().longAccumulator(CONTENT_INVALIDRECORDS); + final LongAccumulator transformedItems = 
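Instead of bare input/output paths, the reworked job receives two serialized MDStoreVersion beans and derives the paths from them. A sketch of that resolution; it assumes only the getHdfsPath() accessor used above, and the constant value is illustrative:

    import com.fasterxml.jackson.databind.ObjectMapper;

    public class MdStorePathExample {
        // illustrative value; the real constant lives in eu.dnetlib.dhp.common.Constants
        private static final String MDSTORE_DATA_PATH = "/store";

        public static void main(String[] args) throws Exception {
            final String json = "{\"hdfsPath\":\"/data/mdstores/md-1234/v2\"}"; // illustrative payload
            final MDStoreVersion version = new ObjectMapper().readValue(json, MDStoreVersion.class);
            System.out.println(version.getHdfsPath() + MDSTORE_DATA_PATH); // /data/mdstores/md-1234/v2/store
        }

        /** minimal stand-in for eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion */
        public static class MDStoreVersion {
            private String hdfsPath;

            public String getHdfsPath() {
                return hdfsPath;
            }

            public void setHdfsPath(final String hdfsPath) {
                this.hdfsPath = hdfsPath;
            }
        }
    }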
spark.sparkContext().longAccumulator(CONTENT_TRANSFORMEDRECORDS); + final AggregationCounter ct = new AggregationCounter(totalItems, errorItems, transformedItems); + final Encoder encoder = Encoders.bean(MetadataRecord.class); + + final String dnetMessageManagerURL = args.get(DNET_MESSAGE_MGR_URL); + log.info("dnetMessageManagerURL is {}", dnetMessageManagerURL); + + final String workflowId = args.get("workflowId"); + log.info("workflowId is {}", workflowId); + + final MessageSender messageSender = new MessageSender(dnetMessageManagerURL, workflowId); + try (AggregatorReport report = new AggregatorReport(messageSender)) { + try { + final Dataset mdstore = spark + .read() + .format("parquet") + .load(inputPath) + .as(encoder) + .map( + TransformationFactory.getTransformationPlugin(args, ct, isLookUpService), + encoder); + saveDataset(mdstore, outputBasePath + MDSTORE_DATA_PATH); + + log.info("Transformed item " + ct.getProcessedItems().count()); + log.info("Total item " + ct.getTotalItems().count()); + log.info("Transformation Error item " + ct.getErrorItems().count()); + + final long mdStoreSize = spark.read().load(outputBasePath + MDSTORE_DATA_PATH).count(); + writeHdfsFile( + spark.sparkContext().hadoopConfiguration(), + "" + mdStoreSize, outputBasePath + MDSTORE_SIZE_PATH); + } catch (Throwable e) { + log.error("error during record transformation", e); + report.put(TransformSparkJobNode.class.getSimpleName(), e.getMessage()); + report.put(CONTENT_TOTALITEMS, ct.getTotalItems().value().toString()); + report.put(CONTENT_INVALIDRECORDS, ct.getErrorItems().value().toString()); + report.put(CONTENT_TRANSFORMEDRECORDS, ct.getProcessedItems().value().toString()); + throw e; + } + } } + } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java new file mode 100644 index 000000000..096d0e289 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java @@ -0,0 +1,69 @@ + +package eu.dnetlib.dhp.transformation; + +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.api.java.function.MapFunction; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.aggregation.common.AggregationCounter; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.schema.mdstore.MetadataRecord; +import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; + +public class TransformationFactory { + + private static final Logger log = LoggerFactory.getLogger(TransformationFactory.class); + public static final String TRULE_XQUERY = "for $x in collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType') where $x//RESOURCE_IDENTIFIER/@value = \"%s\" return $x//CODE/*[local-name() =\"stylesheet\"]"; + + public static MapFunction getTransformationPlugin( + final Map jobArgument, final AggregationCounter counters, final ISLookUpService isLookupService) + throws DnetTransformationException { + + try { + final String transformationPlugin = jobArgument.get("transformationPlugin"); + + log.info("Transformation plugin required " + transformationPlugin); + switch (transformationPlugin) { + case "XSLT_TRANSFORM": { + final String transformationRuleId = jobArgument.get("transformationRuleId"); + if 
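The three LongAccumulators above give the driver a cluster-wide count of records seen, failed and transformed. The counting pattern in isolation, as a runnable local-mode sketch:

    import java.util.Arrays;

    import org.apache.spark.api.java.function.MapFunction;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Encoders;
    import org.apache.spark.sql.SparkSession;
    import org.apache.spark.util.LongAccumulator;

    public class AccumulatorExample {
        public static void main(String[] args) {
            final SparkSession spark = SparkSession.builder().master("local[*]").appName("acc").getOrCreate();
            final LongAccumulator total = spark.sparkContext().longAccumulator("TotalItems");
            final LongAccumulator errors = spark.sparkContext().longAccumulator("ErrorItems");

            final Dataset<String> input = spark
                .createDataset(Arrays.asList("<a/>", "<b/>", "not-xml"), Encoders.STRING());

            input.map((MapFunction<String, String>) s -> {
                total.add(1);
                if (!s.startsWith("<")) {
                    errors.add(1); // a real job would also route the failure into the report
                }
                return s;
            }, Encoders.STRING()).count(); // an action is needed before the values are final

            System.out.println("total=" + total.value() + " errors=" + errors.value());
            spark.stop();
        }
    }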
(StringUtils.isBlank(transformationRuleId)) + throw new DnetTransformationException("Missing parameter transformationRuleId"); + final VocabularyGroup vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService); + + final String transformationRule = queryTransformationRuleFromIS( + transformationRuleId, isLookupService); + + final long dateOfTransformation = Long.parseLong(jobArgument.get("dateOfTransformation")); + return new XSLTTransformationFunction(counters, transformationRule, dateOfTransformation, + vocabularies); + + } + default: + throw new DnetTransformationException( + "transformation plugin does not exist for " + transformationPlugin); + + } + + } catch (Throwable e) { + throw new DnetTransformationException(e); + } + } + + private static String queryTransformationRuleFromIS(final String transformationRuleId, + final ISLookUpService isLookUpService) throws Exception { + final String query = String.format(TRULE_XQUERY, transformationRuleId); + System.out.println("asking query to IS: " + query); + List<String> result = isLookUpService.quickSearchProfile(query); + + if (result == null || result.isEmpty()) + throw new DnetTransformationException( + "Unable to find transformation rule with name: " + transformationRuleId); + return result.get(0); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java deleted file mode 100644 index b5ac18169..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java +++ /dev/null @@ -1,53 +0,0 @@ - -package eu.dnetlib.dhp.transformation.vocabulary; - -import java.io.Serializable; - -public class Term implements Serializable { - - private String englishName; - private String nativeName; - private String encoding; - private String code; - private String synonyms; - - public String getEnglishName() { - return englishName; - } - - public void setEnglishName(String englishName) { - this.englishName = englishName; - } - - public String getNativeName() { - return nativeName; - } - - public void setNativeName(String nativeName) { - this.nativeName = nativeName; - } - - public String getEncoding() { - return encoding; - } - - public void setEncoding(String encoding) { - this.encoding = encoding; - } - - public String getCode() { - return code; - } - - public void setCode(String code) { - this.code = code; - } - - public String getSynonyms() { - return synonyms; - } - - public void setSynonyms(String synonyms) { - this.synonyms = synonyms; - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java deleted file mode 100644 index a9da6b725..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java +++ /dev/null @@ -1,54 +0,0 @@ - -package eu.dnetlib.dhp.transformation.vocabulary; - -import java.io.Serializable; -import java.util.List; - -public class Vocabulary implements Serializable { - - private String id; - private String name; - private String description; - private String code; - private List terms; - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public String getDescription() 
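Wiring the factory above from a driver takes a populated argument map plus an IS lookup client; rule id, date and URL below are placeholders:

    import java.util.HashMap;
    import java.util.Map;

    import org.apache.spark.api.java.function.MapFunction;

    import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
    import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
    import eu.dnetlib.dhp.transformation.DnetTransformationException;
    import eu.dnetlib.dhp.transformation.TransformationFactory;
    import eu.dnetlib.dhp.utils.ISLookupClientFactory;
    import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

    public class TransformationFactoryUsage {
        public static MapFunction<MetadataRecord, MetadataRecord> build(final AggregationCounter counters)
            throws DnetTransformationException {
            final Map<String, String> args = new HashMap<>();
            args.put("transformationPlugin", "XSLT_TRANSFORM");
            args.put("transformationRuleId", "<rule-profile-id>"); // placeholder
            args.put("dateOfTransformation", "1614808800000"); // epoch millis, placeholder
            final ISLookUpService lookUp = ISLookupClientFactory
                .getLookUpService("http://is.example.org/services/isLookUp"); // placeholder URL
            return TransformationFactory.getTransformationPlugin(args, counters, lookUp);
        }
    }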
{ - return description; - } - - public void setDescription(String description) { - this.description = description; - } - - public String getCode() { - return code; - } - - public void setCode(String code) { - this.code = code; - } - - public List getTerms() { - return terms; - } - - public void setTerms(List terms) { - this.terms = terms; - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java deleted file mode 100644 index 10e959be0..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java +++ /dev/null @@ -1,24 +0,0 @@ - -package eu.dnetlib.dhp.transformation.vocabulary; - -import java.io.Serializable; -import java.net.URL; -import java.nio.charset.Charset; - -import org.apache.commons.io.IOUtils; - -import com.fasterxml.jackson.databind.ObjectMapper; - -public class VocabularyHelper implements Serializable { - - private static final String OPENAIRE_URL = "http://api.openaire.eu/vocabularies/%s.json"; - - public static Vocabulary getVocabularyFromAPI(final String vocabularyName) throws Exception { - final URL url = new URL(String.format(OPENAIRE_URL, vocabularyName)); - - final String response = IOUtils.toString(url, Charset.defaultCharset()); - final ObjectMapper jsonMapper = new ObjectMapper(); - final Vocabulary vocabulary = jsonMapper.readValue(response, Vocabulary.class); - return vocabulary; - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java similarity index 54% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java index 7f9b6646c..664215c0e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java @@ -1,25 +1,25 @@ -package eu.dnetlib.dhp.transformation.functions; +package eu.dnetlib.dhp.transformation.xslt; -import java.util.Map; -import java.util.Optional; +import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI; -import eu.dnetlib.dhp.transformation.vocabulary.Term; -import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; +import java.io.Serializable; + +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.schema.oaf.Qualifier; import net.sf.saxon.s9api.*; -import scala.Serializable; public class Cleaner implements ExtensionFunction, Serializable { - private final Map vocabularies; + private final VocabularyGroup vocabularies; - public Cleaner(Map vocabularies) { + public Cleaner(final VocabularyGroup vocabularies) { this.vocabularies = vocabularies; } @Override public QName getName() { - return new QName("http://eu/dnetlib/trasform/extension", "clean"); + return new QName(QNAME_BASE_URI + "/clean", "clean"); } @Override @@ -30,23 +30,22 @@ public class Cleaner implements ExtensionFunction, Serializable { @Override public SequenceType[] getArgumentTypes() { return new SequenceType[] { - SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE), + SequenceType.makeSequenceType(ItemType.STRING, 
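Cleaner (and DateCleaner, added later in this patch) implement Saxon's s9api ExtensionFunction, so a stylesheet can call them once they are registered on the Processor. The registration step, sketched under the assumption that the XSLT text is already in hand:

    import java.io.StringReader;

    import javax.xml.transform.stream.StreamSource;

    import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
    import eu.dnetlib.dhp.transformation.xslt.Cleaner;
    import eu.dnetlib.dhp.transformation.xslt.DateCleaner;
    import net.sf.saxon.s9api.Processor;
    import net.sf.saxon.s9api.XsltCompiler;
    import net.sf.saxon.s9api.XsltExecutable;
    import net.sf.saxon.s9api.XsltTransformer;

    public class SaxonSetupSketch {
        public static XsltTransformer build(final String xslt, final VocabularyGroup vocabularies) throws Exception {
            final Processor processor = new Processor(false); // Saxon-HE feature set
            processor.registerExtensionFunction(new Cleaner(vocabularies));
            processor.registerExtensionFunction(new DateCleaner());
            final XsltCompiler compiler = processor.newXsltCompiler();
            final XsltExecutable executable = compiler.compile(new StreamSource(new StringReader(xslt)));
            return executable.load();
        }
    }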
OccurrenceIndicator.ZERO_OR_MORE), SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE) }; } @Override public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException { + XdmValue r = xdmValues[0]; + if (r.size() == 0) { + return new XdmAtomicValue(""); + } final String currentValue = xdmValues[0].itemAt(0).getStringValue(); final String vocabularyName = xdmValues[1].itemAt(0).getStringValue(); - Optional cleanedValue = vocabularies - .get(vocabularyName) - .getTerms() - .stream() - .filter(it -> it.getNativeName().equalsIgnoreCase(currentValue)) - .findAny(); + Qualifier cleanedValue = vocabularies.getSynonymAsQualifier(vocabularyName, currentValue); return new XdmAtomicValue( - cleanedValue.isPresent() ? cleanedValue.get().getCode() : currentValue); + cleanedValue != null ? cleanedValue.getClassid() : currentValue); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java new file mode 100644 index 000000000..6e337604f --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java @@ -0,0 +1,120 @@ + +package eu.dnetlib.dhp.transformation.xslt; + +import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI; + +import java.io.Serializable; +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import net.sf.saxon.s9api.*; + +public class DateCleaner implements ExtensionFunction, Serializable { + + private final static List dateRegex = Arrays + .asList( + // Y-M-D + Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE), + // M-D-Y + Pattern + .compile( + "((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", + Pattern.MULTILINE), + // D-M-Y + Pattern + .compile( + "(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", + Pattern.MULTILINE), + // Y + Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)); + + private final static Pattern incompleteDateRegex = Pattern + .compile("^((18|19|20)\\d\\d){1}([- \\\\ \\/](0?[1-9]|1[012]))?", Pattern.MULTILINE); + + private final static List dformats = Arrays + .asList( + DateTimeFormatter + .ofPattern( + "[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", + Locale.ENGLISH), + DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)); + + public String clean(final String inputDate) { + + Optional cleanedDate = dateRegex + .stream() + .map( + p -> { + final Matcher matcher = p.matcher(inputDate); + if (matcher.find()) + return matcher.group(0); + else + return null; + }) + .filter(Objects::nonNull) + .map(m -> { + Optional cleanDate = dformats + .stream() + .map(f -> { + try { + LocalDate parsedDate = LocalDate.parse(m, f); + if (parsedDate != null) + return 
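The rewritten call() above delegates synonym resolution to VocabularyGroup and falls back to the raw value when no mapping exists; only the two members shown in this hunk, getSynonymAsQualifier() and getClassid(), are assumed here:

    import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
    import eu.dnetlib.dhp.schema.oaf.Qualifier;

    public final class CleanerLookupSketch {
        /** maps a raw term to its canonical code, keeping the raw value when unknown */
        public static String resolve(final VocabularyGroup vocabularies, final String vocabulary, final String raw) {
            final Qualifier cleaned = vocabularies.getSynonymAsQualifier(vocabulary, raw);
            return cleaned != null ? cleaned.getClassid() : raw;
        }
    }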
parsedDate.toString(); + else + return null; + } catch (Throwable e) { + return null; + } + } + + ) + .filter(Objects::nonNull) + .findAny(); + + return cleanDate.orElse(null); + }) + .filter(Objects::nonNull) + .findAny(); + + if (cleanedDate.isPresent()) + return cleanedDate.get(); + + final Matcher matcher = incompleteDateRegex.matcher(inputDate); + if (matcher.find()) { + final Integer year = Integer.parseInt(matcher.group(1)); + final Integer month = Integer.parseInt(matcher.group(4) == null ? "01" : matcher.group(4)); + return String.format("%d-%02d-01", year, month); + } + return null; + } + + @Override + public QName getName() { + return new QName(QNAME_BASE_URI + "/dateISO", "dateISO"); + } + + @Override + public SequenceType getResultType() { + return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE); + } + + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { + SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE) + }; + } + + @Override + public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException { + XdmValue r = xdmValues[0]; + if (r.size() == 0) { + return new XdmAtomicValue(""); + } + final String currentValue = xdmValues[0].itemAt(0).getStringValue(); + return new XdmAtomicValue(clean(currentValue)); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java similarity index 55% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java index f4bf78e18..430fbcf95 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java @@ -1,41 +1,39 @@ -package eu.dnetlib.dhp.transformation; +package eu.dnetlib.dhp.transformation.xslt; import java.io.ByteArrayInputStream; import java.io.StringWriter; -import java.util.Map; +import java.nio.charset.StandardCharsets; import javax.xml.transform.stream.StreamSource; +import org.apache.commons.io.IOUtils; import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.util.LongAccumulator; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.transformation.functions.Cleaner; -import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; +import eu.dnetlib.dhp.aggregation.common.AggregationCounter; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.schema.mdstore.MetadataRecord; import net.sf.saxon.s9api.*; -public class TransformFunction implements MapFunction { +public class XSLTTransformationFunction implements MapFunction { + + public final static String QNAME_BASE_URI = "http://eu/dnetlib/transform"; + + private final AggregationCounter aggregationCounter; - private final LongAccumulator totalItems; - private final LongAccumulator errorItems; - private final LongAccumulator transformedItems; private final String transformationRule; + private final Cleaner cleanFunction; private final long dateOfTransformation; - public TransformFunction( - LongAccumulator totalItems, - LongAccumulator errorItems, - LongAccumulator transformedItems, + public XSLTTransformationFunction( + final 
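A few worked inputs for clean() above; the expected outputs follow from the regex list and format table, but they are worth pinning down in a unit test rather than by inspection:

    import eu.dnetlib.dhp.transformation.xslt.DateCleaner;

    public class DateCleanerDemo {
        public static void main(String[] args) {
            final DateCleaner cleaner = new DateCleaner();
            System.out.println(cleaner.clean("18/07/2019")); // 2019-07-18, via the D-M-Y pattern
            System.out.println(cleaner.clean("2019-07-18")); // 2019-07-18, already ISO
            System.out.println(cleaner.clean("2016-03"));    // 2016-03-01, incomplete date, day defaults to 01
            System.out.println(cleaner.clean("not a date")); // null
        }
    }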
AggregationCounter aggregationCounter, final String transformationRule, long dateOfTransformation, - final Map vocabularies) + final VocabularyGroup vocabularies) throws Exception { - this.totalItems = totalItems; - this.errorItems = errorItems; - this.transformedItems = transformedItems; + this.aggregationCounter = aggregationCounter; this.transformationRule = transformationRule; this.dateOfTransformation = dateOfTransformation; cleanFunction = new Cleaner(vocabularies); @@ -43,32 +41,35 @@ public class TransformFunction implements MapFunction\u00b2Dok[\u00a7]" + }, + "OCEAN.OCEAN": { + "openaire_id": "re3data_____::r3d100012369", + "datacite_name": "Code Ocean", + "official_name": "Code Ocean" + }, + "CERN.ZENODO": { + "openaire_id": "re3data_____::r3d100010468", + "datacite_name": "Zenodo", + "official_name": "Zenodo" + }, + "ETHZ.DA-RD": { + "openaire_id": "re3data_____::r3d100011626", + "datacite_name": "ETHZ Data Archive - Research Data", + "official_name": "ETH Data Archive" + }, + "SND.ECDS": { + "openaire_id": "re3data_____::r3d100011000", + "datacite_name": "Environment Climate Data Sweden", + "official_name": "Environment Climate Data Sweden" + }, + "BL.BATH": { + "openaire_id": "re3data_____::r3d100011947", + "datacite_name": "University of Bath", + "official_name": "University of Bath Research Data Archive" + }, + "TIB.LDEO": { + "openaire_id": "re3data_____::r3d100012547", + "datacite_name": "LDEO - Lamont-Doherty Earth Observatory, Columbia University", + "official_name": "Lamont-Doherty Core Repository" + }, + "COS.OSF": { + "openaire_id": "re3data_____::r3d100011137", + "datacite_name": "Open Science Framework", + "official_name": "Open Science Framework" + }, + "ESTDOI.REPO": { + "openaire_id": "re3data_____::r3d100012333", + "datacite_name": "DataDOI", + "official_name": "DataDOI" + }, + "CDL.NSFADC": { + "openaire_id": "re3data_____::r3d100011973", + "datacite_name": "NSF Arctic Data Center", + "official_name": "NSF Arctic Data Center" + }, + "ANDS.CENTRE13": { + "openaire_id": "re3data_____::r3d100010477", + "datacite_name": "The Australian National University", + "official_name": "ANU Data Commons" + }, + "BL.NERC": { + "openaire_id": "re3data_____::r3d100010199", + "datacite_name": "Natural Environment Research Council", + "official_name": "Environmental Information Data Centre" + }, + "SAGEBIO.SYNAPSE": { + "openaire_id": "re3data_____::r3d100011894", + "datacite_name": "Synapse", + "official_name": "Synapse" + }, + "ANDS.CENTRE15": { + "openaire_id": "re3data_____::r3d100000038", + "datacite_name": "Australian Antarctic Division", + "official_name": "Australian Antarctic Data Centre" + }, + "WISC.BMRB": { + "openaire_id": "re3data_____::r3d100010191", + "datacite_name": "Biological Magnetic Resonance Bank", + "official_name": "Biological Magnetic Resonance Data Bank", + "similarity": 0.9315068493150684 + }, + "STSCI.MAST": { + "openaire_id": "re3data_____::r3d100010403", + "datacite_name": "Barbara A. Mikulski Archive for Space Telescopes", + "official_name": "Barbara A. 
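The JSON map above keys DataCite client ids (for example CERN.ZENODO) to the matching OpenAIRE datasource id and name, so at processing time it can be loaded as a plain nested map; the resource path below is an assumption, adjust it to wherever this file lands on the classpath:

    import java.io.InputStream;
    import java.util.Map;

    import com.fasterxml.jackson.core.type.TypeReference;
    import com.fasterxml.jackson.databind.ObjectMapper;

    public class HostedByMapExample {
        public static void main(String[] args) throws Exception {
            // assumed resource name; some entries also carry a numeric "similarity" score, hence Object values
            try (InputStream in = HostedByMapExample.class
                .getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/hostedBy_map.json")) {
                final Map<String, Map<String, Object>> hostedBy = new ObjectMapper()
                    .readValue(in, new TypeReference<Map<String, Map<String, Object>>>() {});
                System.out.println(hostedBy.get("CERN.ZENODO").get("openaire_id")); // re3data_____::r3d100010468
            }
        }
    }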
Mikulski Archive for Space Telescopes" + }, + "CDL.NSIDC": { + "openaire_id": "re3data_____::r3d100010110", + "datacite_name": "National Snow and Ice Data Center", + "official_name": "National Snow and Ice Data Center" + }, + "BL.STRATH": { + "openaire_id": "re3data_____::r3d100012412", + "datacite_name": "University of Strathclyde", + "official_name": "University of Strathclyde KnowledgeBase Datasets" + }, + "DEMO.TDAR": { + "openaire_id": "re3data_____::r3d100010347", + "datacite_name": "The Digital Archaeological Record (tDAR)", + "official_name": "tDAR" + }, + "TIND.CALTECH": { + "openaire_id": "re3data_____::r3d100012384", + "datacite_name": "CaltechDATA", + "official_name": "CaltechDATA" + }, + "GESIS.BIBB-FDZ": { + "openaire_id": "re3data_____::r3d100010190", + "datacite_name": "Forschungsdatenzentrum im Bundesinstitut f\u00fcr Berufsbildung", + "official_name": "Forschungsdatenzentrum im Bundesinstitut f\u00fcr Berufsbildung" + }, + "ANDS.CENTRE87": { + "openaire_id": "re3data_____::r3d100010138", + "datacite_name": "Australian Data Archive", + "official_name": "Australian Data Archive" + }, + "GESIS.NEPS": { + "openaire_id": "re3data_____::r3d100010736", + "datacite_name": "Nationales Bildungspanel (National Educational Panel Study, NEPS)", + "official_name": "Nationales Bildungspanel" + }, + "CDL.UCBCRCNS": { + "openaire_id": "re3data_____::r3d100011269", + "datacite_name": "Collaborative Research in Computational Neuroscience (CRCNS)", + "official_name": "Collaborative Research in Computational Neuroscience" + }, + "TIB.UKON": { + "openaire_id": "re3data_____::r3d100010469", + "datacite_name": "Movebank", + "official_name": "Movebank" + }, + "UMN.IPUMS": { + "openaire_id": "re3data_____::r3d100010794", + "datacite_name": "Minnesota Population Center", + "official_name": "Minnesota Population Center" + }, + "TIB.BIKF": { + "openaire_id": "re3data_____::r3d100012379", + "datacite_name": "Senckenberg Data & Metadata Repository", + "official_name": "Senckenberg Data & Metadata Repository" + }, + "TDL.GRIIDC": { + "openaire_id": "re3data_____::r3d100011571", + "datacite_name": "Gulf of Mexico Research Initiative Information and Data Cooperative", + "official_name": "Gulf of Mexico Research Initiative Information and Data Cooperative" + }, + "DELFT.NIBG": { + "openaire_id": "re3data_____::r3d100012167", + "datacite_name": "Sound and Vision", + "official_name": "Sound and Vision" + }, + "BL.SURREY": { + "openaire_id": "re3data_____::r3d100012232", + "datacite_name": "University of Surrey", + "official_name": "Surrey Research Insight" + }, + "OSTI.ORNLNGEE": { + "openaire_id": "re3data_____::r3d100011676", + "datacite_name": "NGEE-Arctic (Next Generation Ecosystems Experiement)", + "official_name": "NGEE Arctic" + }, + "TIB.WDCRSAT": { + "openaire_id": "re3data_____::r3d100010156", + "datacite_name": "World Data Center for Remote Sensing of the Atmosphere", + "official_name": "The World Data Center for Remote Sensing of the Atmosphere", + "similarity": 0.9642857142857143 + }, + "ZBMED.DSMZ": { + "openaire_id": "re3data_____::r3d100010219", + "datacite_name": "DSMZ", + "official_name": "DSMZ" + }, + "DOINZ.NZAU": { + "openaire_id": "re3data_____::r3d100012110", + "datacite_name": "University of Auckland Data Publishing and Discovery Service", + "official_name": "University of Auckland Data Repository" + }, + "INIST.RESIF": { + "openaire_id": "re3data_____::r3d100012222", + "datacite_name": "R\u00e9seau sismologique et g\u00e9od\u00e9sique fran\u00e7ais", + "official_name": "RESIF Seismic 
Data Portal" + }, + "CDL.NCEAS": { + "openaire_id": "re3data_____::r3d100010093", + "datacite_name": "National Center for Ecological Analysis and Synthesis (NCEAS)", + "official_name": "National Center for Ecological Analysis and Synthesis Data Repository" + }, + "ZBMED.EMP": { + "openaire_id": "re3data_____::r3d100010234", + "datacite_name": "eyeMoviePedia", + "official_name": "eyeMoviePedia" + }, + "ZBMED.BIOFRESH": { + "openaire_id": "re3data_____::r3d100011651", + "datacite_name": "Project BioFresh, Leibniz-Institute of Freshwater Ecology and Inland Fisheries", + "official_name": "Freshwater Biodiversity Data Portal" + }, + "INIST.IFREMER": { + "openaire_id": "re3data_____::r3d100011867", + "datacite_name": "Institut Fran\u00e7ais de Recherche pour l'Exploitation de la Mer", + "official_name": "SEANOE" + }, + "ETHZ.SICAS": { + "openaire_id": "re3data_____::r3d100011560", + "datacite_name": "SICAS", + "official_name": "Sicas Medical Image Repository" + }, + "SND.SND": { + "openaire_id": "re3data_____::r3d100010146", + "datacite_name": "Swedish National Data Service", + "official_name": "Swedish National Data Service" + }, + "DELFT.EASY": { + "openaire_id": "re3data_____::r3d100011201", + "datacite_name": "DANS", + "official_name": "DataverseNL" + }, + "WH.WHOAS": { + "openaire_id": "re3data_____::r3d100010423", + "datacite_name": "Woods Hole Open Access Server", + "official_name": "Woods Hole Open Access Server" + }, + "DATACITE.UCSC": { + "openaire_id": "re3data_____::r3d100010243", + "datacite_name": "UCSC Genome Browser", + "official_name": "UCSC Genome Browser" + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json new file mode 100644 index 000000000..69fb039ba --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json @@ -0,0 +1,33 @@ +[ + { + "paramName": "t", + "paramLongName": "targetPath", + "paramDescription": "the path of the sequencial file to write", + "paramRequired": true + }, + + { + "paramName": "d", + "paramLongName": "dataciteDumpPath", + "paramDescription": "the path of the Datacite dump", + "paramRequired": true + }, + { + "paramName": "s", + "paramLongName": "skipImport", + "paramDescription": "avoid to downlaod new items but apply the previous update", + "paramRequired": false + }, + { + "paramName": "n", + "paramLongName": "namenode", + "paramDescription": "the hive metastore uris", + "paramRequired": true + }, + { + "paramName": "m", + "paramLongName": "master", + "paramDescription": "the master name", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/config-default.xml new file mode 100644 index 000000000..dd3c32c62 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/config-default.xml @@ -0,0 +1,23 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + + oozie.launcher.mapreduce.user.classpath.first + true + + \ No newline at end of file diff --git 
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml new file mode 100644 index 000000000..15378c6c7 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml @@ -0,0 +1,129 @@ + + + + mdstoreInputPath + the path of the input MDStore + + + + mdstoreOutputPath + the path of the cleaned mdstore + + + nativeInputPath + the path of the native records + + + skipimport + false + whether to skip the download of new DataCite records + + + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + ${wf:conf('resumeFrom') eq 'TransformJob'} + ${wf:conf('resumeFrom') eq 'ExportDataset'} + + + + + + + yarn-cluster + cluster + ImportDatacite + eu.dnetlib.dhp.actionmanager.datacite.ImportDatacite + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + -t${nativeInputPath} + -d${mdstoreInputPath} + -n${nameNode} + -s${skipimport} + --masteryarn-cluster + + + + + + + + + yarn-cluster + cluster + TransformJob + eu.dnetlib.dhp.actionmanager.datacite.GenerateDataciteDatasetSpark + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${mdstoreInputPath} + --targetPath${mdstoreOutputPath} + --isLookupUrl${isLookupUrl} + -tr${isLookupUrl} + --masteryarn-cluster + + + + + + + + + + + + + + + + + yarn-cluster + cluster + ExportDataset + eu.dnetlib.dhp.actionmanager.datacite.ExportActionSetJobNode + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${mdstoreOutputPath} + --targetPath${mdstoreOutputPath}_raw_AS + --masteryarn-cluster + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json deleted file mode 100644 index 4a6aec5ee..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json +++ /dev/null @@ -1,86 +0,0 @@ -[ - { - "paramName": "issm", - "paramLongName": "isSparkSessionManaged", - "paramDescription": "when true will stop SparkSession after
job execution", - "paramRequired": false - }, - { - "paramName": "e", - "paramLongName": "encoding", - "paramDescription": "the encoding of the input record should be JSON or XML", - "paramRequired": true - }, - { - "paramName": "d", - "paramLongName": "dateOfCollection", - "paramDescription": "the date when the record has been stored", - "paramRequired": true - }, - { - "paramName": "p", - "paramLongName": "provenance", - "paramDescription": "the infos about the provenance of the collected records", - "paramRequired": true - }, - { - "paramName": "x", - "paramLongName": "xpath", - "paramDescription": "the xpath to identify the record identifier", - "paramRequired": true - }, - { - "paramName": "i", - "paramLongName": "input", - "paramDescription": "the path of the sequencial file to read", - "paramRequired": true - }, - { - "paramName": "o", - "paramLongName": "output", - "paramDescription": "the path of the result DataFrame on HDFS", - "paramRequired": true - }, - { - "paramName": "ru", - "paramLongName": "rabbitUser", - "paramDescription": "the user to connect with RabbitMq for messaging", - "paramRequired": true - }, - { - "paramName": "rp", - "paramLongName": "rabbitPassword", - "paramDescription": "the password to connect with RabbitMq for messaging", - "paramRequired": true - }, - { - "paramName": "rh", - "paramLongName": "rabbitHost", - "paramDescription": "the host of the RabbitMq server", - "paramRequired": true - }, - { - "paramName": "ro", - "paramLongName": "rabbitOngoingQueue", - "paramDescription": "the name of the ongoing queue", - "paramRequired": true - }, - { - "paramName": "rr", - "paramLongName": "rabbitReportQueue", - "paramDescription": "the name of the report queue", - "paramRequired": true - }, - { - "paramName": "w", - "paramLongName": "workflowId", - "paramDescription": "the identifier of the dnet Workflow", - "paramRequired": true - }, - { - "paramName": "t", - "paramLongName": "isTest", - "paramDescription": "the name of the report queue", - "paramRequired": false - } -] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json new file mode 100644 index 000000000..cd4b8224b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json @@ -0,0 +1,62 @@ +[ + { + "paramName": "a", + "paramLongName": "apidescriptor", + "paramDescription": "the JSON encoding of the API Descriptor", + "paramRequired": true + }, + { + "paramName": "n", + "paramLongName": "namenode", + "paramDescription": "the Name Node URI", + "paramRequired": true + }, + { + "paramName": "mv", + "paramLongName": "mdStoreVersion", + "paramDescription": "the MDStore Version bean", + "paramRequired": true + }, + { + "paramName": "dm", + "paramLongName": "dnetMessageManagerURL", + "paramDescription": "the End point URL to send Messages", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workflowId", + "paramDescription": "the identifier of the dnet Workflow", + "paramRequired": true + }, + { + "paramName": "mnr", + "paramLongName": "maxNumberOfRetry", + "paramDescription": "the maximum number of admitted connection retries", + "paramRequired": false + }, + { + "paramName": "rqd", + "paramLongName": "requestDelay", + "paramDescription": "the delay (ms) between requests", + "paramRequired": false + }, + { 
+ "paramName": "rtd", + "paramLongName": "retryDelay", + "paramDescription": "the delay (ms) between retries", + "paramRequired": false + }, + { + "paramName": "cto", + "paramLongName": "connectTimeOut", + "paramDescription": "the maximum allowed time (ms) to connect to the remote host", + "paramRequired": false + }, + { + "paramName": "rto", + "paramLongName": "readTimeOut", + "paramDescription": "the maximum allowed time (ms) to receive content from the remote host", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/generate_native_input_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/generate_native_input_parameters.json new file mode 100644 index 000000000..987f004bb --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/generate_native_input_parameters.json @@ -0,0 +1,50 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "e", + "paramLongName": "encoding", + "paramDescription": "the encoding of the input record should be JSON or XML", + "paramRequired": true + }, + { + "paramName": "d", + "paramLongName": "dateOfCollection", + "paramDescription": "the date when the record has been stored", + "paramRequired": true + }, + { + "paramName": "p", + "paramLongName": "provenance", + "paramDescription": "the infos about the provenance of the collected records", + "paramRequired": true + }, + { + "paramName": "x", + "paramLongName": "xpath", + "paramDescription": "the xpath to identify the record identifier", + "paramRequired": true + }, + { + "paramName": "mv", + "paramLongName": "mdStoreVersion", + "paramDescription": "the Metadata Store Version Info", + "paramRequired": true + }, + { + "paramName": "rmv", + "paramLongName": "readMdStoreVersion", + "paramDescription": "the Read Lock Metadata Store Version bean", + "paramRequired": false + }, + { + "paramName": "w", + "paramLongName": "workflowId", + "paramDescription": "the identifier of the dnet Workflow", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/mdstore_action_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/mdstore_action_parameters.json new file mode 100644 index 000000000..57a218a34 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/mdstore_action_parameters.json @@ -0,0 +1,45 @@ +[ + { + "paramName": "a", + "paramLongName": "action", + "paramDescription": "the JSON encoding of the API Descriptor", + "paramRequired": true + }, + { + "paramName": "mu", + "paramLongName": "mdStoreManagerURI", + "paramDescription": "the MDStore Manager URI", + "paramRequired": true + }, + { + "paramName": "mi", + "paramLongName": "mdStoreID", + "paramDescription": "the Metadata Store ID", + "paramRequired": false + }, + { + "paramName": "ms", + "paramLongName": "mdStoreSize", + "paramDescription": "the Metadata Store Size", + "paramRequired": false + }, + { + "paramName": "mv", + "paramLongName": "mdStoreVersion", + "paramDescription": "the Metadata Version Bean", + "paramRequired": false + }, + { + "paramName": "n", + "paramLongName": "namenode", + "paramDescription": "the Name Node URI", + "paramRequired": false + }, + { + 
"paramName": "rm", + "paramLongName": "readMDStoreId", + "paramDescription": "the ID Locked to Read", + "paramRequired": false + } + +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/config-default.xml new file mode 100644 index 000000000..e77dd09c9 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/config-default.xml @@ -0,0 +1,22 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + oozie.launcher.mapreduce.user.classpath.first + true + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml index 3e7f68401..0678eed11 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml @@ -1,112 +1,212 @@ - - sequenceFilePath - the path to store the sequence file of the native metadata collected - - - - mdStorePath - the path of the native mdstore - - apiDescription A json encoding of the API Description class - dataSourceInfo A json encoding of the Datasource Info identifierPath - An xpath to retrieve the metadata idnentifier for the generation of DNet Identifier + An xpath to retrieve the metadata identifier for the generation of DNet Identifier - metadataEncoding The type of the metadata XML/JSON - timestamp The timestamp of the collection date - workflowId The identifier of the workflow + + mdStoreID + The identifier of the mdStore + + + mdStoreManagerURI + The URI of the MDStore Manager + + + + dnetMessageManagerURL + The URI of the Dnet Message Manager + + + collectionMode + Should be REFRESH or INCREMENTAL + + + + collection_java_xmx + -Xmx200m + Used to configure the heap size for the map JVM process. Should be 80% of mapreduce.map.memory.mb. 
+ + + - + + ${jobTracker} + ${nameNode} + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - + + + + ${wf:conf('collectionMode') eq 'REFRESH'} + ${wf:conf('collectionMode') eq 'INCREMENTAL'} + + + + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + ${collection_java_xmx} + --actionREAD_LOCK + --mdStoreID${mdStoreID} + --mdStoreManagerURI${mdStoreManagerURI} + + + + + + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + ${collection_java_xmx} + --actionNEW_VERSION + --mdStoreID${mdStoreID} + --mdStoreManagerURI${mdStoreManagerURI} + + + - ${jobTracker} - ${nameNode} - eu.dnetlib.dhp.collection.worker.DnetCollectorWorker - -p${sequenceFilePath} - -a${apiDescription} - -n${nameNode} - -rh${rmq_host} - -ru${rmq_user} - -rp${rmq_pwd} - -rr${rmq_report} - -ro${rmq_ongoing} - -usandro.labruzzo - -w${workflowId} + eu.dnetlib.dhp.collection.CollectorWorkerApplication + ${collection_java_xmx} + --apidescriptor${apiDescription} + --namenode${nameNode} + --workflowId${workflowId} + --dnetMessageManagerURL${dnetMessageManagerURL} + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --maxNumberOfRetry${maxNumberOfRetry} + --requestDelay${requestDelay} + --retryDelay${retryDelay} + --connectTimeOut${connectTimeOut} + --readTimeOut${readTimeOut} - - - - - ${jobTracker} - ${nameNode} - yarn - cluster - GenerateNativeStoreSparkJob - eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob - dhp-aggregations-1.0.0-SNAPSHOT.jar - --num-executors 50 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2" - --encoding ${metadataEncoding} - --dateOfCollection ${timestamp} - --provenance ${dataSourceInfo} - --xpath${identifierPath} - --input${sequenceFilePath} - --output${mdStorePath} - -rh${rmq_host} - -ru${rmq_user} - -rp${rmq_pwd} - -rr${rmq_report} - -ro${rmq_ongoing} - -w${workflowId} - - - + - - - - + + + yarn + cluster + Generate Native MetadataStore + eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --encoding${metadataEncoding} + --dateOfCollection${timestamp} + --provenance${dataSourceInfo} + --xpath${identifierPath} + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --readMdStoreVersion${wf:actionData('BeginRead')['mdStoreReadLockVersion']} + + + + + + + + ${wf:conf('collectionMode') eq 'REFRESH'} + ${wf:conf('collectionMode') eq 'INCREMENTAL'} + + + + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + ${collection_java_xmx} + --actionREAD_UNLOCK + --mdStoreManagerURI${mdStoreManagerURI} + --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']} + + + + + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + ${collection_java_xmx} + --actionCOMMIT + --namenode${nameNode} + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --mdStoreManagerURI${mdStoreManagerURI} + + + + + + + + ${wf:conf('collectionMode') eq 'REFRESH'} + ${wf:conf('collectionMode') eq 'INCREMENTAL'} + + + + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + 
${collection_java_xmx} + --actionREAD_UNLOCK + --mdStoreManagerURI${mdStoreManagerURI} + --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']} + + + + + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + ${collection_java_xmx} + --actionROLLBACK + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --mdStoreManagerURI${mdStoreManagerURI} + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collector/worker/collector_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collector/worker/collector_parameter.json deleted file mode 100644 index c247d15e4..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collector/worker/collector_parameter.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - {"paramName":"p", "paramLongName":"hdfsPath", "paramDescription": "the path where storing the sequential file", "paramRequired": true}, - {"paramName":"a", "paramLongName":"apidescriptor", "paramDescription": "the JSON encoding of the API Descriptor", "paramRequired": true}, - {"paramName":"n", "paramLongName":"namenode", "paramDescription": "the Name Node URI", "paramRequired": true}, - {"paramName":"u", "paramLongName":"userHDFS", "paramDescription": "the user wich create the hdfs seq file", "paramRequired": true}, - {"paramName":"ru", "paramLongName":"rabbitUser", "paramDescription": "the user to connect with RabbitMq for messaging", "paramRequired": true}, - {"paramName":"rp", "paramLongName":"rabbitPassword", "paramDescription": "the password to connect with RabbitMq for messaging", "paramRequired": true}, - {"paramName":"rh", "paramLongName":"rabbitHost", "paramDescription": "the host of the RabbitMq server", "paramRequired": true}, - {"paramName":"ro", "paramLongName":"rabbitOngoingQueue", "paramDescription": "the name of the ongoing queue", "paramRequired": true}, - {"paramName":"rr", "paramLongName":"rabbitReportQueue", "paramDescription": "the name of the report queue", "paramRequired": true}, - {"paramName":"w", "paramLongName":"workflowId", "paramDescription": "the identifier of the dnet Workflow", "paramRequired": true} -] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml new file mode 100644 index 000000000..bdd48b0ab --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml @@ -0,0 +1,19 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml index 4b1e3d84b..61e5710fa 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml @@ -1,76 +1,187 @@ - mdstoreInputPath - the path of the input MDStore + mdStoreInputId + the identifier of the native MDStore - - mdstoreOutputPath - the path of the cleaned mdstore + mdStoreOutputId + the identifier of the cleaned MDStore + + + mdStoreManagerURI + the URI of the MDStore Manager - - transformationRule + transformationRuleId The transformation Rule to apply - - timestamp - The timestamp of the collection date + transformationPlugin + XSLT_TRANSFORM + The transformation Plugin + + + dateOfTransformation + The timestamp of the transformation date + + + isLookupUrl + The IS lookUp service endpoint - workflowId The identifier of the workflow + + dnetMessageManagerURL + The URI of the Dnet Message Manager + - + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - + + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionREAD_LOCK + --mdStoreID${mdStoreInputId} + --mdStoreManagerURI${mdStoreManagerURI} + + + - - - ${jobTracker} - ${nameNode} - yarn - cluster - MDBuilder - eu.dnetlib.dhp.transformation.TransformSparkJobNode - dhp-aggregations-1.0.0-SNAPSHOT.jar - --num-executors 50 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2" - --dateOfCollection ${timestamp} - -mt yarn - --input${mdstoreInputPath} - --output${mdstoreOutputPath} - -w${workflowId} - -tr${transformationRule} - -ru${rmq_user} - -rp${rmq_pwd} - -rh${rmq_host} - -ro${rmq_ongoing} - -rr${rmq_report} - - - + + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionNEW_VERSION + --mdStoreID${mdStoreOutputId} + --mdStoreManagerURI${mdStoreManagerURI} + + + + - - - - + + + yarn + cluster + Transform MetadataStore + eu.dnetlib.dhp.transformation.TransformSparkJobNode + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --mdstoreOutputVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --mdstoreInputVersion${wf:actionData('BeginRead')['mdStoreReadLockVersion']} + --dateOfTransformation${dateOfTransformation} + --transformationPlugin${transformationPlugin} + --transformationRuleId${transformationRuleId} + --isLookupUrl${isLookupUrl} + --workflowId${workflowId} + --dnetMessageManagerURL${dnetMessageManagerURL} + + + + + + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionREAD_UNLOCK + --mdStoreManagerURI${mdStoreManagerURI} + --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']} + + + + + + + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionCOMMIT + --namenode${nameNode} + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --mdStoreManagerURI${mdStoreManagerURI} + + + + + + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionREAD_UNLOCK + --mdStoreManagerURI${mdStoreManagerURI} + --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']} + + + + + + + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionROLLBACK + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --mdStoreManagerURI${mdStoreManagerURI} + - + + \ No newline at end of file
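Taken together, the actions above give the transformation workflow the shape of a transaction against the MDStore Manager: read-lock the input store, open a new output version, run the Spark transformation, then commit on success or roll back (and release the read lock) on failure. A minimal sketch (not part of this patch; the manager URI, store IDs and version beans are illustrative placeholders) of that sequence expressed through MDStoreActionNode:

public class MdStoreTransactionSketch {
	public static void main(String[] args) throws Exception {
		String managerUri = "http://example.org/mdstore-manager"; // placeholder for ${mdStoreManagerURI}
		String readLockVersionId = "..."; // returned by READ_LOCK ('mdStoreReadLockVersion' in the oozie actionData)
		String newVersionJson = "..."; // MDStoreVersion bean returned by NEW_VERSION ('mdStoreVersion')
		// lock the input MDStore for reading
		run("--action", "READ_LOCK", "--mdStoreID", "md-input-id", "--mdStoreManagerURI", managerUri);
		// open a new writable version of the output MDStore
		run("--action", "NEW_VERSION", "--mdStoreID", "md-output-id", "--mdStoreManagerURI", managerUri);
		try {
			// ... the TransformSparkJobNode spark action runs here, reading the locked
			// input version and writing the cleaned records into the new version ...
			run("--action", "READ_UNLOCK", "--mdStoreManagerURI", managerUri, "--readMDStoreId", readLockVersionId);
			run("--action", "COMMIT", "--namenode", "hdfs://nameservice1", "--mdStoreVersion", newVersionJson, "--mdStoreManagerURI", managerUri);
		} catch (Exception e) {
			// failure path: release the read lock, then roll the new version back
			run("--action", "READ_UNLOCK", "--mdStoreManagerURI", managerUri, "--readMDStoreId", readLockVersionId);
			run("--action", "ROLLBACK", "--mdStoreVersion", newVersionJson, "--mdStoreManagerURI", managerUri);
			throw e;
		}
	}

	private static void run(String... actionArgs) throws Exception {
		eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode.main(actionArgs);
	}
}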
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/transformation_input_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/transformation_input_parameters.json index 4bb5fd56a..ee9099dde 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/transformation_input_parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/transformation_input_parameters.json @@ -7,20 +7,39 @@ }, { "paramName": "d", - "paramLongName": "dateOfCollection", + "paramLongName": "dateOfTransformation", "paramDescription": "the date when the record has been stored", "paramRequired": true }, { "paramName": "i", - "paramLongName": "input", - "paramDescription": "the path of the sequencial file to read", + "paramLongName": "mdstoreInputVersion", + "paramDescription": "the mdStore Version bean of the Input", "paramRequired": true }, { "paramName": "o", - "paramLongName": "output", - "paramDescription": "the path of the result DataFrame on HDFS", + "paramLongName": "mdstoreOutputVersion", + "paramDescription": "the mdStore Version bean of the Output", + "paramRequired": true + }, + { + "paramName": "tr", + "paramLongName": "transformationRuleId", + "paramDescription": "the transformation Rule to apply to the input MDStore", + "paramRequired": true + }, + + { + "paramName": "iu", + "paramLongName": "isLookupUrl", + "paramDescription": "the Information System Service LookUp URL", + "paramRequired": true + }, + { + "paramName": "dm", + "paramLongName": "dnetMessageManagerURL", + "paramDescription": "the endpoint URL to send Messages", "paramRequired": true }, { @@ -30,45 +49,9 @@ "paramRequired": true }, { - "paramName": "tr", - "paramLongName": "transformationRule", - "paramDescription": "the transformation Rule to apply to the input MDStore", + "paramName": "tp", + "paramLongName": "transformationPlugin", + "paramDescription": "the transformation plugin to apply", "paramRequired": true - }, - { - "paramName": "ru", - "paramLongName": "rabbitUser", - "paramDescription": "the user to connect with RabbitMq for messaging", - "paramRequired": true - }, - { - "paramName": "rp", - "paramLongName": "rabbitPassword", - "paramDescription": "the password to connect with RabbitMq for messaging", - "paramRequired": true - }, - { - "paramName": "rh", - "paramLongName": "rabbitHost", - "paramDescription": "the host of the RabbitMq server", - "paramRequired": true - }, - { - "paramName": "ro", - "paramLongName": "rabbitOngoingQueue", - "paramDescription": "the name of the ongoing queue", - "paramRequired": true - }, - { - "paramName": "rr", - "paramLongName": "rabbitReportQueue", - "paramDescription": "the name of the report queue", - "paramRequired": true - }, - { - "paramName": "t", - "paramLongName": "isTest", - "paramDescription": "the name of the report queue", - "paramRequired": false } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java index 59b536cd5..acb4caa22 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java @@ -12,15 +12,15
@@ import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; -import eu.dnetlib.dhp.actionmanager.project.httpconnector.CollectorServiceException; -import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector; import eu.dnetlib.dhp.actionmanager.project.utils.EXCELParser; +import eu.dnetlib.dhp.collection.CollectorException; +import eu.dnetlib.dhp.collection.HttpConnector2; @Disabled public class EXCELParserTest { private static Path workingDir; - private HttpConnector httpConnector = new HttpConnector(); + private HttpConnector2 httpConnector = new HttpConnector2(); private static final String URL = "http://cordis.europa.eu/data/reference/cordisref-H2020topics.xlsx"; @BeforeAll @@ -30,7 +30,7 @@ public class EXCELParserTest { } @Test - public void test1() throws CollectorServiceException, IOException, InvalidFormatException, ClassNotFoundException, + public void test1() throws CollectorException, IOException, InvalidFormatException, ClassNotFoundException, IllegalAccessException, InstantiationException { EXCELParser excelParser = new EXCELParser(); diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java deleted file mode 100644 index 90b3919ed..000000000 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java +++ /dev/null @@ -1,41 +0,0 @@ - -package eu.dnetlib.dhp.actionmanager.project.httpconnector; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.http.conn.ssl.SSLConnectionSocketFactory; -import org.apache.http.ssl.SSLContextBuilder; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; - -@Disabled -public class HttpConnectorTest { - - private static final Log log = LogFactory.getLog(HttpConnectorTest.class); - private static HttpConnector connector; - - private static final String URL = "http://cordis.europa.eu/data/reference/cordisref-H2020topics.xlsx"; - private static final String URL_MISCONFIGURED_SERVER = "https://www.alexandria.unisg.ch/cgi/oai2?verb=Identify"; - private static final String URL_GOODSNI_SERVER = "https://air.unimi.it/oai/openaire?verb=Identify"; - - private static final SSLContextBuilder sslContextBuilder = new SSLContextBuilder(); - private static SSLConnectionSocketFactory sslSocketFactory; - - @BeforeAll - public static void setUp() { - connector = new HttpConnector(); - } - - @Test - - public void testGetInputSource() throws CollectorServiceException { - System.out.println(connector.getInputSource(URL)); - } - - @Test - public void testGoodServers() throws CollectorServiceException { - System.out.println(connector.getInputSource(URL_GOODSNI_SERVER)); - } - -} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AbstractVocabularyTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AbstractVocabularyTest.java new file mode 100644 index 000000000..8e0f0ce4b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AbstractVocabularyTest.java @@ -0,0 +1,52 @@ + +package eu.dnetlib.dhp.aggregation; + +import static org.mockito.Mockito.lenient; + +import java.io.IOException; +import java.util.Collections; +import 
java.util.List; + +import org.apache.commons.io.IOUtils; +import org.mockito.Mock; + +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.transformation.TransformationFactory; +import eu.dnetlib.dhp.transformation.TransformationJobTest; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; + +public abstract class AbstractVocabularyTest { + + @Mock + protected ISLookUpService isLookUpService; + + protected VocabularyGroup vocabularies; + + public void setUpVocabulary() throws ISLookUpException, IOException { + lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs()); + + lenient() + .when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY)) + .thenReturn(synonyms()); + vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService); + } + + private static List vocs() throws IOException { + return IOUtils + .readLines(TransformationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/terms.txt")); + } + + private static List synonyms() throws IOException { + return IOUtils + .readLines(TransformationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/synonyms.txt")); + } + + protected void mockupTrasformationRule(final String trule, final String path) throws Exception { + final String trValue = IOUtils.toString(this.getClass().getResourceAsStream(path)); + + lenient() + .when(isLookUpService.quickSearchProfile(String.format(TransformationFactory.TRULE_XQUERY, trule))) + .thenReturn(Collections.singletonList(trValue)); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java deleted file mode 100644 index c3b05f5c9..000000000 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java +++ /dev/null @@ -1,117 +0,0 @@ - -package eu.dnetlib.dhp.collection; - -import static org.junit.jupiter.api.Assertions.*; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; - -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.IOUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.sql.SparkSession; -import org.junit.jupiter.api.*; -import org.junit.jupiter.api.io.TempDir; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.model.mdstore.Provenance; -import eu.dnetlib.dhp.schema.common.ModelSupport; - -public class CollectionJobTest { - - private static SparkSession spark; - - @BeforeAll - public static void beforeAll() { - SparkConf conf = new SparkConf(); - conf.setAppName(CollectionJobTest.class.getSimpleName()); - conf.setMaster("local"); - spark = SparkSession.builder().config(conf).getOrCreate(); - } - - @AfterAll - public static void afterAll() { - spark.stop(); - } - - @Test - public void tesCollection(@TempDir Path testDir) throws Exception { - final Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix"); - Assertions.assertNotNull(new ObjectMapper().writeValueAsString(provenance)); - - GenerateNativeStoreSparkJob - .main( - new String[] { - "issm", "true", - "-w", "wid", - "-e", "XML", - "-d", "" + System.currentTimeMillis(), - "-p", new ObjectMapper().writeValueAsString(provenance), - "-x", 
"./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", - "-i", this.getClass().getResource("/eu/dnetlib/dhp/collection/native.seq").toString(), - "-o", testDir.toString() + "/store", - "-t", "true", - "-ru", "", - "-rp", "", - "-rh", "", - "-ro", "", - "-rr", "" - }); - - // TODO introduce useful assertions - - } - - @Test - public void testGenerationMetadataRecord() throws Exception { - - final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); - - final MetadataRecord record = GenerateNativeStoreSparkJob - .parseRecord( - xml, - "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", - "XML", - new Provenance("foo", "bar", "ns_prefix"), - System.currentTimeMillis(), - null, - null); - - assertNotNull(record.getId()); - assertNotNull(record.getOriginalId()); - } - - @Test - public void TestEquals() throws IOException { - - final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); - final MetadataRecord record = GenerateNativeStoreSparkJob - .parseRecord( - xml, - "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", - "XML", - new Provenance("foo", "bar", "ns_prefix"), - System.currentTimeMillis(), - null, - null); - final MetadataRecord record1 = GenerateNativeStoreSparkJob - .parseRecord( - xml, - "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", - "XML", - new Provenance("foo", "bar", "ns_prefix"), - System.currentTimeMillis(), - null, - null); - - record.setBody("ciao"); - record1.setBody("mondo"); - - assertNotNull(record); - assertNotNull(record1); - assertEquals(record, record1); - } -} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionWorkflowTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionWorkflowTest.java new file mode 100644 index 000000000..cd6275d7f --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionWorkflowTest.java @@ -0,0 +1,113 @@ + +package eu.dnetlib.dhp.collection; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.hdfs.DistributedFileSystem; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.MethodOrderer; +import org.junit.jupiter.api.TestMethodOrder; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.junit.jupiter.MockitoExtension; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; + +@TestMethodOrder(MethodOrderer.OrderAnnotation.class) +@ExtendWith(MockitoExtension.class) +public class CollectionWorkflowTest { + + private static final Logger log = LoggerFactory.getLogger(CollectionWorkflowTest.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static Path workingDir; + + private static DistributedFileSystem fileSystem; + + // private static MiniDFSCluster hdfsCluster; + + private static ApiDescriptor api; + private static String mdStoreVersion; + + private static final String encoding = "XML"; + private static final String dateOfCollection = System.currentTimeMillis() + ""; + private static final String xpath = "//*[local-name()='header']/*[local-name()='identifier']"; + private static String 
provenance; + + private static final String msgMgrUrl = "http://localhost:%s/mock/mvc/dhp/message"; + + @BeforeAll + protected static void beforeAll() throws Exception { + provenance = IOUtils + .toString(CollectionWorkflowTest.class.getResourceAsStream("/eu/dnetlib/dhp/collection/provenance.json")); + + workingDir = Files.createTempDirectory(CollectionWorkflowTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + /* + * Configuration conf = new Configuration(); conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, + * workingDir.toString()); hdfsCluster = new MiniDFSCluster.Builder(conf).build(); fileSystem = + * hdfsCluster.getFileSystem(); api = OBJECT_MAPPER .readValue( + * IOUtils.toString(CollectionWorkflowTest.class.getResourceAsStream("apiDescriptor.json")), + * ApiDescriptor.class); mdStoreVersion = OBJECT_MAPPER + * .writeValueAsString(prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_1.json")); + */ + } + + @AfterAll + protected static void tearDown() { + /* + * hdfsCluster.shutdown(); FileUtil.fullyDelete(workingDir.toFile()); + */ + + } + + /** + + + eu.dnetlib.dhp.collection.worker.CollectorWorkerApplication + ${collection_java_xmx} + --apidescriptor${apiDescription} + --namenode${nameNode} + --workflowId${workflowId} + --dnetMessageManagerURL${dnetMessageManagerURL} + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --maxNumberOfRetry${maxNumberOfRetry} + --requestDelay${requestDelay} + --retryDelay${retryDelay} + --connectTimeOut${connectTimeOut} + --readTimeOut${readTimeOut} + + + + + + */ + // @Test + // @Order(1) + public void testCollectorWorkerApplication() throws Exception { + + final HttpClientParams httpClientParams = new HttpClientParams(); + + // String url = String.format(msgMgrUrl, wireMockServer.port()); + + // new CollectorWorkerApplication(fileSystem).run(mdStoreVersion, httpClientParams, api, url, "1234"); + + } + + public static MDStoreVersion prepareVersion(String filename) throws IOException { + MDStoreVersion mdstore = OBJECT_MAPPER + .readValue(IOUtils.toString(CollectionWorkflowTest.class.getResource(filename)), MDStoreVersion.class); + mdstore.setHdfsPath(String.format(mdstore.getHdfsPath(), workingDir.toString())); + return mdstore; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJobTest.java new file mode 100644 index 000000000..b8eb58ec2 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJobTest.java @@ -0,0 +1,300 @@ + +package eu.dnetlib.dhp.collection; + +import static eu.dnetlib.dhp.common.Constants.MDSTORE_DATA_PATH; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FilterFunction; +import 
org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.junit.jupiter.MockitoExtension; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; +import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest; +import eu.dnetlib.dhp.schema.mdstore.MetadataRecord; +import eu.dnetlib.dhp.schema.mdstore.Provenance; +import eu.dnetlib.dhp.transformation.TransformSparkJobNode; + +@TestMethodOrder(MethodOrderer.OrderAnnotation.class) +@ExtendWith(MockitoExtension.class) +public class GenerateNativeStoreSparkJobTest extends AbstractVocabularyTest { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static SparkSession spark; + + private static Path workingDir; + + private static Encoder encoder; + + private static final String encoding = "XML"; + private static final String dateOfCollection = System.currentTimeMillis() + ""; + private static final String xpath = "//*[local-name()='header']/*[local-name()='identifier']"; + private static String provenance; + + private static final Logger log = LoggerFactory.getLogger(GenerateNativeStoreSparkJobTest.class); + + @BeforeAll + public static void beforeAll() throws IOException { + provenance = IOUtils + .toString( + GenerateNativeStoreSparkJobTest.class + .getResourceAsStream("/eu/dnetlib/dhp/collection/provenance.json")); + workingDir = Files.createTempDirectory(GenerateNativeStoreSparkJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + + conf.setAppName(GenerateNativeStoreSparkJobTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + encoder = Encoders.bean(MetadataRecord.class); + spark = SparkSession + .builder() + .appName(GenerateNativeStoreSparkJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + @Test + @Order(1) + public void testGenerateNativeStoreSparkJobRefresh() throws Exception { + + MDStoreVersion mdStoreV1 = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_1.json"); + FileUtils.forceMkdir(new File(mdStoreV1.getHdfsPath())); + + IOUtils + .copy( + getClass().getResourceAsStream("/eu/dnetlib/dhp/collection/sequence_file"), + new FileOutputStream(mdStoreV1.getHdfsPath() + "/sequence_file")); + + GenerateNativeStoreSparkJob + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-encoding", encoding, + "-dateOfCollection", dateOfCollection, + "-provenance", provenance, + "-xpath", xpath, + "-mdStoreVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV1), + "-readMdStoreVersion", "", + "-workflowId", "abc" + }); + + verify(mdStoreV1); + } + + @Test + @Order(2) + public void testGenerateNativeStoreSparkJobIncremental() throws Exception { + + MDStoreVersion mdStoreV2 = 
prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_2.json"); + FileUtils.forceMkdir(new File(mdStoreV2.getHdfsPath())); + + IOUtils + .copy( + getClass().getResourceAsStream("/eu/dnetlib/dhp/collection/sequence_file"), + new FileOutputStream(mdStoreV2.getHdfsPath() + "/sequence_file")); + + MDStoreVersion mdStoreV1 = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_1.json"); + + GenerateNativeStoreSparkJob + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-encoding", encoding, + "-dateOfCollection", dateOfCollection, + "-provenance", provenance, + "-xpath", xpath, + "-mdStoreVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV2), + "-readMdStoreVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV1), + "-workflowId", "abc" + }); + + verify(mdStoreV2); + } + + @Test + @Order(3) + public void testTransformSparkJob() throws Exception { + + setUpVocabulary(); + + MDStoreVersion mdStoreV2 = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_2.json"); + MDStoreVersion mdStoreCleanedVersion = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreCleanedVersion.json"); + + mockupTrasformationRule("simpleTRule", "/eu/dnetlib/dhp/transform/ext_simple.xsl"); + + final Map parameters = Stream.of(new String[][] { + { + "dateOfTransformation", "1234" + }, + { + "transformationPlugin", "XSLT_TRANSFORM" + }, + { + "transformationRuleId", "simpleTRule" + }, + + }).collect(Collectors.toMap(data -> data[0], data -> data[1])); + + TransformSparkJobNode + .transformRecords( + parameters, isLookUpService, spark, mdStoreV2.getHdfsPath() + MDSTORE_DATA_PATH, + mdStoreCleanedVersion.getHdfsPath()); + + final Encoder encoder = Encoders.bean(MetadataRecord.class); + final Dataset mOutput = spark + .read() + .format("parquet") + .load(mdStoreCleanedVersion.getHdfsPath() + MDSTORE_DATA_PATH) + .as(encoder); + + final Long total = mOutput.count(); + + final long recordTs = mOutput + .filter((FilterFunction) p -> p.getDateOfTransformation() == 1234) + .count(); + + final long recordNotEmpty = mOutput + .filter((FilterFunction) p -> !StringUtils.isBlank(p.getBody())) + .count(); + + assertEquals(total, recordTs); + + assertEquals(total, recordNotEmpty); + + } + + @Test + public void testJSONSerialization() throws Exception { + final String s = IOUtils.toString(getClass().getResourceAsStream("mdStoreVersion_1.json")); + System.out.println("s = " + s); + final ObjectMapper mapper = new ObjectMapper(); + MDStoreVersion mi = mapper.readValue(s, MDStoreVersion.class); + + assertNotNull(mi); + + } + + @Test + public void testGenerationMetadataRecord() throws Exception { + + final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); + + final MetadataRecord record = GenerateNativeStoreSparkJob + .parseRecord( + xml, + "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", + "XML", + new Provenance("foo", "bar", "ns_prefix"), + System.currentTimeMillis(), + null, + null); + + assertNotNull(record.getId()); + assertNotNull(record.getOriginalId()); + } + + @Test + public void testEquals() throws IOException { + + final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); + final MetadataRecord record = GenerateNativeStoreSparkJob + .parseRecord( + xml, + "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", + "XML", + new Provenance("foo", "bar", "ns_prefix"), + System.currentTimeMillis(), + null, + null); + final MetadataRecord record1 = 
GenerateNativeStoreSparkJob + .parseRecord( + xml, + "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", + "XML", + new Provenance("foo", "bar", "ns_prefix"), + System.currentTimeMillis(), + null, + null); + + record.setBody("ciao"); + record1.setBody("mondo"); + + assertNotNull(record); + assertNotNull(record1); + assertEquals(record, record1); + } + + protected void verify(MDStoreVersion mdStoreVersion) throws IOException { + Assertions.assertTrue(new File(mdStoreVersion.getHdfsPath()).exists()); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + long seqFileSize = sc + .sequenceFile(mdStoreVersion.getHdfsPath() + "/sequence_file", IntWritable.class, Text.class) + .count(); + + final Dataset mdstore = spark.read().load(mdStoreVersion.getHdfsPath() + "/store").as(encoder); + long mdStoreSize = mdstore.count(); + + long declaredSize = Long.parseLong(IOUtils.toString(new FileReader(mdStoreVersion.getHdfsPath() + "/size"))); + + Assertions.assertEquals(seqFileSize, declaredSize, "the size must be equal"); + Assertions.assertEquals(seqFileSize, mdStoreSize, "the size must be equal"); + + long uniqueIds = mdstore + .map((MapFunction) MetadataRecord::getId, Encoders.STRING()) + .distinct() + .count(); + + Assertions.assertEquals(seqFileSize, uniqueIds, "the size must be equal"); + } + + public MDStoreVersion prepareVersion(String filename) throws IOException { + MDStoreVersion mdstore = OBJECT_MAPPER + .readValue(IOUtils.toString(getClass().getResource(filename)), MDStoreVersion.class); + mdstore.setHdfsPath(String.format(mdstore.getHdfsPath(), workingDir.toString())); + return mdstore; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java new file mode 100644 index 000000000..648ac85fb --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java @@ -0,0 +1,81 @@ +/** + * + */ + +package eu.dnetlib.dhp.collection.plugin.rest; + +import java.util.HashMap; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Stream; + +import org.junit.jupiter.api.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.collection.CollectorException; +import eu.dnetlib.dhp.collection.HttpClientParams; + +/** + * @author js, Andreas Czerniak + * + */ +public class RestCollectorPluginTest { + + private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class); + + private String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search"; + private String resumptionType = "count"; + private String resumptionParam = "from"; + private String entityXpath = "//hits/hits"; + private String resumptionXpath = "//hits"; + private String resultTotalXpath = "//hits/total"; + private String resultFormatParam = "format"; + private String resultFormatValue = "json"; + private String resultSizeParam = "size"; + private String resultSizeValue = "10"; + // private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29"; + private String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29"; + // private String query = "=(sources:engrXiv AND type:preprint)"; + + private String protocolDescriptor = 
"rest_json2xml"; + private ApiDescriptor api = new ApiDescriptor(); + private RestCollectorPlugin rcp; + + @BeforeEach + public void setUp() { + HashMap params = new HashMap<>(); + params.put("resumptionType", resumptionType); + params.put("resumptionParam", resumptionParam); + params.put("resumptionXpath", resumptionXpath); + params.put("resultTotalXpath", resultTotalXpath); + params.put("resultFormatParam", resultFormatParam); + params.put("resultFormatValue", resultFormatValue); + params.put("resultSizeParam", resultSizeParam); + params.put("resultSizeValue", resultSizeValue); + params.put("queryParams", query); + params.put("entityXpath", entityXpath); + + api.setBaseUrl(baseUrl); + api.setParams(params); + + rcp = new RestCollectorPlugin(new HttpClientParams()); + } + + @Disabled + @Test + public void test() throws CollectorException { + AtomicInteger i = new AtomicInteger(0); + final Stream stream = rcp.collect(api, new AggregatorReport()); + + stream.limit(200).forEach(s -> { + Assertions.assertTrue(s.length() > 0); + i.incrementAndGet(); + log.info(s); + }); + + log.info("{}", i.intValue()); + Assertions.assertTrue(i.intValue() > 0); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java new file mode 100644 index 000000000..16604e0eb --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java @@ -0,0 +1,54 @@ +/** + * + */ + +package eu.dnetlib.dhp.collection.plugin.rest; + +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.collection.HttpClientParams; + +/** + * + * @author js, Andreas Czerniak + * @date 2020-04-08 + */ +public class RestIteratorTest { + + private static final Logger log = LoggerFactory.getLogger(RestIteratorTest.class); + + private String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search"; + private String resumptionType = "count"; + private String resumptionParam = "from"; + private String resumptionXpath = ""; + private String resultTotalXpath = "//hits/total"; + private String entityXpath = "//hits/hits"; + private String resultFormatParam = "format"; + private String resultFormatValue = "Json"; // Change from lowerCase to one UpperCase + private String resultSizeParam = "size"; + private String resultSizeValue = "10"; + private String authMethod = ""; + private String authToken = ""; + private String resultOffsetParam = "cursor"; + private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29"; + + @Disabled + @Test + public void test() { + + HttpClientParams clientParams = new HttpClientParams(); + + final RestIterator iterator = new RestIterator(clientParams, baseUrl, resumptionType, resumptionParam, + resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue, + query, entityXpath, authMethod, authToken, resultOffsetParam); + int i = 20; + while (iterator.hasNext() && i > 0) { + String result = iterator.next(); + + i--; + } + } +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/CollectorWorkerApplicationTests.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/CollectorWorkerApplicationTests.java new file mode 100644 index 000000000..b5ea5f069 --- /dev/null +++ 
b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/CollectorWorkerApplicationTests.java @@ -0,0 +1,35 @@ + +package eu.dnetlib.dhp.collector.worker; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.collection.ApiDescriptor; + +@Disabled +public class CollectorWorkerApplicationTests { + + @Test + public void testCollectionOAI() throws Exception { + final ApiDescriptor api = new ApiDescriptor(); + api.setId("oai"); + api.setProtocol("oai"); + api.setBaseUrl("http://www.revista.vocesdelaeducacion.com.mx/index.php/index/oai"); + api.getParams().put("format", "oai_dc"); + ObjectMapper mapper = new ObjectMapper(); + assertNotNull(mapper.writeValueAsString(api)); + } + + private ApiDescriptor getApi() { + final ApiDescriptor api = new ApiDescriptor(); + api.setId("oai"); + api.setProtocol("oai"); + api.setBaseUrl("http://www.revista.vocesdelaeducacion.com.mx/index.php/index/oai"); + api.getParams().put("format", "oai_dc"); + return api; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java deleted file mode 100644 index c745219fe..000000000 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java +++ /dev/null @@ -1,94 +0,0 @@ - -package eu.dnetlib.dhp.collector.worker; - -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.mockito.Mockito.*; - -import java.io.File; - -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.collector.worker.model.ApiDescriptor; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.worker.DnetCollectorWorker; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; -import eu.dnetlib.message.Message; -import eu.dnetlib.message.MessageManager; - -@Disabled -public class DnetCollectorWorkerApplicationTests { - - private final ArgumentApplicationParser argumentParser = mock(ArgumentApplicationParser.class); - private final MessageManager messageManager = mock(MessageManager.class); - - private DnetCollectorWorker worker; - - @BeforeEach - public void setup() throws Exception { - ObjectMapper mapper = new ObjectMapper(); - final String apiJson = mapper.writeValueAsString(getApi()); - when(argumentParser.get("apidescriptor")).thenReturn(apiJson); - when(argumentParser.get("namenode")).thenReturn("file://tmp/test.seq"); - when(argumentParser.get("hdfsPath")).thenReturn("/tmp/file.seq"); - when(argumentParser.get("userHDFS")).thenReturn("sandro"); - when(argumentParser.get("workflowId")).thenReturn("sandro"); - when(argumentParser.get("rabbitOngoingQueue")).thenReturn("sandro"); - - when(messageManager.sendMessage(any(Message.class), anyString(), anyBoolean(), anyBoolean())) - .thenAnswer( - a -> { - System.out.println("sent message: " + a.getArguments()[0]); - return true; - }); - when(messageManager.sendMessage(any(Message.class), anyString())) - .thenAnswer( - a -> { - System.out.println("Called"); - return true; - }); - worker = new 
DnetCollectorWorker(new CollectorPluginFactory(), argumentParser, messageManager); - } - - @AfterEach - public void dropDown() { - File f = new File("/tmp/file.seq"); - f.delete(); - } - - @Test - public void testFindPlugin() throws Exception { - final CollectorPluginFactory collectorPluginEnumerator = new CollectorPluginFactory(); - assertNotNull(collectorPluginEnumerator.getPluginByProtocol("oai")); - assertNotNull(collectorPluginEnumerator.getPluginByProtocol("OAI")); - } - - @Test - public void testCollectionOAI() throws Exception { - final ApiDescriptor api = new ApiDescriptor(); - api.setId("oai"); - api.setProtocol("oai"); - api.setBaseUrl("http://www.revista.vocesdelaeducacion.com.mx/index.php/index/oai"); - api.getParams().put("format", "oai_dc"); - ObjectMapper mapper = new ObjectMapper(); - assertNotNull(mapper.writeValueAsString(api)); - } - - @Test - public void testFeeding() throws Exception { - worker.collect(); - } - - private ApiDescriptor getApi() { - final ApiDescriptor api = new ApiDescriptor(); - api.setId("oai"); - api.setProtocol("oai"); - api.setBaseUrl("http://www.revista.vocesdelaeducacion.com.mx/index.php/index/oai"); - api.getParams().put("format", "oai_dc"); - return api; - } -} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index 98c8cf66c..f3a0685ac 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -1,165 +1,252 @@ package eu.dnetlib.dhp.transformation; -import static org.junit.jupiter.api.Assertions.assertNotNull; +import static eu.dnetlib.dhp.common.Constants.MDSTORE_DATA_PATH; +import static org.junit.jupiter.api.Assertions.assertEquals; -import java.io.StringWriter; -import java.nio.file.Files; +import java.io.IOException; import java.nio.file.Path; -import java.util.HashMap; import java.util.Map; - -import javax.xml.transform.stream.StreamSource; +import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import org.apache.spark.util.LongAccumulator; -import org.dom4j.Document; -import org.dom4j.Node; -import org.dom4j.io.SAXReader; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.DisplayName; -import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.*; import org.junit.jupiter.api.extension.ExtendWith; import org.junit.jupiter.api.io.TempDir; -import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; -import eu.dnetlib.dhp.collection.CollectionJobTest; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.transformation.functions.Cleaner; -import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; -import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper; -import eu.dnetlib.dhp.utils.DHPUtils; -import net.sf.saxon.s9api.*; +import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest; +import eu.dnetlib.dhp.aggregation.common.AggregationCounter; +import 
eu.dnetlib.dhp.schema.mdstore.MetadataRecord; +import eu.dnetlib.dhp.transformation.xslt.DateCleaner; +import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; @ExtendWith(MockitoExtension.class) -public class TransformationJobTest { +public class TransformationJobTest extends AbstractVocabularyTest { - private static SparkSession spark; + private SparkConf sparkConf; - @BeforeAll - public static void beforeAll() { - SparkConf conf = new SparkConf(); - conf.setAppName(CollectionJobTest.class.getSimpleName()); - conf.setMaster("local"); - spark = SparkSession.builder().config(conf).getOrCreate(); + @BeforeEach + public void setUp() throws IOException, ISLookUpException { + setUpVocabulary(); + + sparkConf = new SparkConf(); + sparkConf.setMaster("local[*]"); + sparkConf.set("spark.driver.host", "localhost"); + sparkConf.set("spark.ui.enabled", "false"); } - @AfterAll - public static void afterAll() { - spark.stop(); - } - - @Mock - private LongAccumulator accumulator; - @Test + @DisplayName("Test Date cleaner") + public void testDateCleaner() throws Exception { + DateCleaner dc = new DateCleaner(); + assertEquals(dc.clean("20/09/1982"), "1982-09-20"); + assertEquals(dc.clean("20-09-2002"), "2002-09-20"); + assertEquals(dc.clean("2002-09-20"), "2002-09-20"); + assertEquals(dc.clean("2002-9"), "2002-09-01"); + assertEquals(dc.clean("2021"), "2021-01-01"); + } + + @Test + @DisplayName("Test Transform Single XML using zenodo_tr XSLTTransformator") public void testTransformSaxonHE() throws Exception { - Map<String, Vocabulary> vocabularies = new HashMap<>(); - vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages")); - Cleaner cleanFunction = new Cleaner(vocabularies); - Processor proc = new Processor(false); - proc.registerExtensionFunction(cleanFunction); - final XsltCompiler comp = proc.newXsltCompiler(); - XsltExecutable exp = comp - .compile( - new StreamSource( - this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/ext_simple.xsl"))); - XdmNode source = proc - .newDocumentBuilder() - .build( - new StreamSource( - this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml"))); - XsltTransformer trans = exp.load(); - trans.setInitialContextNode(source); - final StringWriter output = new StringWriter(); - Serializer out = proc.newSerializer(output); - out.setOutputProperty(Serializer.Property.METHOD, "xml"); - out.setOutputProperty(Serializer.Property.INDENT, "yes"); - trans.setDestination(out); - trans.transform(); - System.out.println(output.toString()); - } + // We set the input record, getting the XML from the classpath + final MetadataRecord mr = new MetadataRecord(); + mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_zenodo.xml"))); + // We load the XSLT transformation rule from the classpath + XSLTTransformationFunction tr = loadTransformationRule("/eu/dnetlib/dhp/transform/zenodo_tr.xslt"); - @DisplayName("Test TransformSparkJobNode.main") - @Test - public void transformTest(@TempDir Path testDir) throws Exception { - final String mdstore_input = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile(); - final String mdstore_output = testDir.toString() + "/version"; - final String xslt = DHPUtils - .compressString( - IOUtils - .toString( - this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml"))); - TransformSparkJobNode - .main( - new String[] { - "-issm", "true", - "-i", mdstore_input, -
"-o", mdstore_output, - "-d", "1", - "-w", "1", - "-tr", xslt, - "-t", "true", - "-ru", "", - "-rp", "", - "-rh", "", - "-ro", "", - "-rr", "" - }); - - // TODO introduce useful assertions - } - - @Test - public void tryLoadFolderOnCP() throws Exception { - final String path = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile(); - System.out.println("path = " + path); - - Path tempDirWithPrefix = Files.createTempDirectory("mdstore_output"); - - System.out.println(tempDirWithPrefix.toFile().getAbsolutePath()); - - Files.deleteIfExists(tempDirWithPrefix); - } - - @Test - public void testTransformFunction() throws Exception { - SAXReader reader = new SAXReader(); - Document document = reader.read(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")); - Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']"); - final String xslt = node.asXML(); - Map vocabularies = new HashMap<>(); - vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages")); - - TransformFunction tf = new TransformFunction(accumulator, accumulator, accumulator, xslt, 1, vocabularies); - - MetadataRecord record = new MetadataRecord(); - record - .setBody( - IOUtils - .toString( - this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml"))); - - final MetadataRecord result = tf.call(record); - assertNotNull(result.getBody()); + MetadataRecord result = tr.call(mr); + // Print the record System.out.println(result.getBody()); + // TODO Create significant Assert } @Test - public void extractTr() throws Exception { + @DisplayName("Test Transform Inst.&Them.v4 record XML with zenodo_tr") + public void testTransformITGv4Zenodo() throws Exception { - final String xmlTr = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")); + // We Set the input Record getting the XML from the classpath + final MetadataRecord mr = new MetadataRecord(); + mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml"))); + // We Load the XSLT transformation Rule from the classpath + XSLTTransformationFunction tr = loadTransformationRule("/eu/dnetlib/dhp/transform/zenodo_tr.xslt"); - SAXReader reader = new SAXReader(); - Document document = reader.read(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")); - Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']"); + MetadataRecord result = tr.call(mr); - System.out.println(node.asXML()); + // Print the record + System.out.println(result.getBody()); + // TODO Create significant Assert } + + @Test + @DisplayName("Test Transform Inst.&Them.v4 record XML with xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid") + public void testTransformITGv4() throws Exception { + + // We Set the input Record getting the XML from the classpath + final MetadataRecord mr = new MetadataRecord(); + mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml"))); + // We Load the XSLT transformation Rule from the classpath + XSLTTransformationFunction tr = loadTransformationRule( + "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl"); + + MetadataRecord result = tr.call(mr); + + // Print the record + System.out.println(result.getBody()); + // TODO Create significant Assert + } + + @Test + @DisplayName("Test Transform record XML with xslt_cleaning_datarepo_datacite") + public void 
testTransformMostlyUsedScript() throws Exception { + + // We set the input record, getting the XML from the classpath + final MetadataRecord mr = new MetadataRecord(); + mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml"))); + // We load the XSLT transformation rule from the classpath + XSLTTransformationFunction tr = loadTransformationRule( + "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl"); + + MetadataRecord result = tr.call(mr); + + // Print the record + System.out.println(result.getBody()); + // TODO Create significant Assert + } + + @Test + @DisplayName("Test TransformSparkJobNode.main with oaiOpenaire_datacite (v4)") + public void transformTestITGv4OAIdatacite(@TempDir Path testDir) throws Exception { + + try (SparkSession spark = SparkSession.builder().config(sparkConf).getOrCreate()) { + + final String mdstore_input = this .getClass() .getResource("/eu/dnetlib/dhp/transform/mdstorenative") .getFile(); + final String mdstore_output = testDir.toString() + "/version"; + + mockupTrasformationRule( + "simpleTRule", + "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl"); + + final Map<String, String> parameters = Stream.of(new String[][] { + { + "dateOfTransformation", "1234" + }, + { + "varOfficialName", "Publications at Bielefeld University" + }, + { + "varOfficialId", "opendoar____::2294" + }, + { + "transformationPlugin", "XSLT_TRANSFORM" + }, + { + "transformationRuleId", "simpleTRule" + }, + + }).collect(Collectors.toMap(data -> data[0], data -> data[1])); + + TransformSparkJobNode.transformRecords(parameters, isLookUpService, spark, mdstore_input, mdstore_output); + + // TODO introduce useful assertions + + final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class); + final Dataset<MetadataRecord> mOutput = spark .read() .format("parquet") .load(mdstore_output + MDSTORE_DATA_PATH) .as(encoder); + + final Long total = mOutput.count(); + + final long recordTs = mOutput + .filter((FilterFunction<MetadataRecord>) p -> p.getDateOfTransformation() == 1234) + .count(); + + final long recordNotEmpty = mOutput + .filter((FilterFunction<MetadataRecord>) p -> !StringUtils.isBlank(p.getBody())) + .count(); + + assertEquals(total, recordTs); + + assertEquals(total, recordNotEmpty); + } + } + + @Test + @DisplayName("Test TransformSparkJobNode.main") + public void transformTest(@TempDir Path testDir) throws Exception { + + try (SparkSession spark = SparkSession.builder().config(sparkConf).getOrCreate()) { + + final String mdstore_input = this .getClass() .getResource("/eu/dnetlib/dhp/transform/mdstorenative") .getFile(); + final String mdstore_output = testDir.toString() + "/version"; + + mockupTrasformationRule("simpleTRule", "/eu/dnetlib/dhp/transform/ext_simple.xsl"); + + final Map<String, String> parameters = Stream.of(new String[][] { + { + "dateOfTransformation", "1234" + }, + { + "transformationPlugin", "XSLT_TRANSFORM" + }, + { + "transformationRuleId", "simpleTRule" + }, + + }).collect(Collectors.toMap(data -> data[0], data -> data[1])); + + TransformSparkJobNode.transformRecords(parameters, isLookUpService, spark, mdstore_input, mdstore_output); + + // TODO introduce useful assertions + + final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class); + final Dataset<MetadataRecord> mOutput = spark .read() .format("parquet") .load(mdstore_output + MDSTORE_DATA_PATH) .as(encoder); + + final Long total = mOutput.count(); + + final long recordTs = mOutput + .filter((FilterFunction<MetadataRecord>) p -> p.getDateOfTransformation() == 1234) + .count(); +
+ final long recordNotEmpty = mOutput + .filter((FilterFunction<MetadataRecord>) p -> !StringUtils.isBlank(p.getBody())) + .count(); + + assertEquals(total, recordTs); + + assertEquals(total, recordNotEmpty); + } + } + + private XSLTTransformationFunction loadTransformationRule(final String path) throws Exception { + final String trValue = IOUtils.toString(this.getClass().getResourceAsStream(path)); + final LongAccumulator la = new LongAccumulator(); + return new XSLTTransformationFunction(new AggregationCounter(la, la, la), trValue, 0, vocabularies); + } + } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java deleted file mode 100644 index 1ae942a6b..000000000 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java +++ /dev/null @@ -1,16 +0,0 @@ - -package eu.dnetlib.dhp.transformation.vocabulary; - -import static org.junit.jupiter.api.Assertions.*; - -import org.junit.jupiter.api.Test; - -public class VocabularyTest { - - @Test - public void testLoadVocabulary() throws Exception { - - final Vocabulary vocabulary = VocabularyHelper.getVocabularyFromAPI("dnet:languages"); - assertEquals("dnet:languages", vocabulary.getName()); - } -} diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/apiDescriptor.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/apiDescriptor.json new file mode 100644 index 000000000..99957cac9 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/apiDescriptor.json @@ -0,0 +1,10 @@ +{ + "id":"api_________::opendoar____::2::0", + "baseUrl":"https://www.alexandria.unisg.ch/cgi/oai2", + "protocol":"oai", + "params": { + "set":"driver", + "metadata_identifier_path":"//*[local-name()\u003d\u0027header\u0027]/*[local-name()\u003d\u0027identifier\u0027]", + "format":"oai_dc" + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/mdStoreCleanedVersion.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/mdStoreCleanedVersion.json new file mode 100644 index 000000000..a5adc8fda --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/mdStoreCleanedVersion.json @@ -0,0 +1,9 @@ +{ + "id":"md-cleaned", + "mdstore":"md-cleaned", + "writing":false, + "readCount":1, + "lastUpdate":1612187563099, + "size":71, + "hdfsPath":"%s/mdstore/md-cleaned" +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/mdStoreVersion_1.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/mdStoreVersion_1.json new file mode 100644 index 000000000..8945c3d88 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/mdStoreVersion_1.json @@ -0,0 +1,9 @@ +{ + "id":"md-84e86d00-5771-4ed9-b17f-177ef4b46e42-1612187678801", + "mdstore":"md-84e86d00-5771-4ed9-b17f-177ef4b46e42", + "writing":true, + "readCount":0, + "lastUpdate":null, + "size":0, + "hdfsPath":"%s/mdstore/md-84e86d00-5771-4ed9-b17f-177ef4b46e42/v1" +} \ No newline at end of file
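The mdStore*.json fixtures above (mdStoreCleanedVersion, mdStoreVersion_1, mdStoreVersion_2) all share one shape: an identifier, the owning mdstore, a writing flag, a reader count, a last-update timestamp, a record count, and an HDFS path whose "%s" placeholder is resolved against the test working directory. As a minimal sketch only (the class and member names below are illustrative, not the model class actually used by these workflows), such fixtures could bind to a bean like:

// Illustrative sketch: mirrors the fields of the mdStore*.json test fixtures;
// the real model class in this repository may differ in name and package.
import java.io.Serializable;

public class MDStoreVersionSketch implements Serializable {

	private String id; // e.g. "md-84e86d00-5771-4ed9-b17f-177ef4b46e42-1612187678801"
	private String mdstore; // identifier of the owning mdstore
	private boolean writing; // true while this version is still being written
	private int readCount; // number of readers currently attached
	private Long lastUpdate; // epoch millis; null until the version is committed
	private long size; // number of stored records
	private String hdfsPath; // the "%s" placeholder is filled with the mdstore base path

	public String getId() { return id; }

	public void setId(final String id) { this.id = id; }

	// the remaining getters/setters follow the same pattern; they are needed for
	// Jackson binding, e.g. new ObjectMapper().readValue(json, MDStoreVersionSketch.class)
}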
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/mdStoreVersion_2.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/mdStoreVersion_2.json new file mode 100644 index 000000000..c3d4617cb --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/mdStoreVersion_2.json @@ -0,0 +1,9 @@ +{ + "id":"md-84e86d00-5771-4ed9-b17f-177ef4b46e42-1612187459108", + "mdstore":"md-84e86d00-5771-4ed9-b17f-177ef4b46e42", + "writing":false, + "readCount":1, + "lastUpdate":1612187563099, + "size":71, + "hdfsPath":"%s/mdstore/md-84e86d00-5771-4ed9-b17f-177ef4b46e42/v2" +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/provenance.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/provenance.json new file mode 100644 index 000000000..2cf0dab70 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/provenance.json @@ -0,0 +1,5 @@ +{ + "datasourceId":"74912366-d6df-49c1-a1fd-8a52fa98ce5f_UmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZXMvUmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZVR5cGU\u003d", + "datasourceName":"PSNC Institutional Repository", + "nsPrefix":"psnc______pl" +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/sequence_file b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/sequence_file new file mode 100644 index 000000000..309645a5f Binary files /dev/null and b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/sequence_file differ diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl index cef50aa95..8f8ce2270 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl @@ -1,15 +1,19 @@ [XSLT markup stripped during extraction; of the edited hunk only the attribute fragment exclude-result-prefixes="xsl vocabulary" survives] diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input.xml index 8760d3117..3d136d56d 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input.xml +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input.xml @@ -1,37 +1,32 @@

- oai:research.chalmers.se:243692 - 2018-01-25T18:04:43Z - openaire + oai:lib.psnc.pl:278 + 2011-08-25T15:17:13Z + PSNCRepository:PSNCExternalRepository:exhibitions + PSNCRepository:PSNCExternalRepository:Departments + PSNCRepository:PSNCExternalRepository:Departments:NetworkServices + PSNCRepository:PSNCExternalRepository + aRTIcle - Letter to the editor + PSNCRepository
- - Incipient Berezinskii-Kosterlitz-Thouless transition in two-dimensional coplanar Josephson junctions - https://research.chalmers.se/en/publication/243692 - 2016 - Massarotti, D. - Jouault, B. - Rouco, V. - Charpentier, Sophie - Bauch, Thilo - Michon, A. - De Candia, A. - Lucignano, P. - Lombardi, Floriana - Tafuri, F. - Tagliacozzo, A. - Acoli - Abkhazian - Condensed Matter Physics - Superconducting hybrid junctions are revealing a variety of effects. Some of them are due to the special layout of these devices, which often use a coplanar configuration with relatively large barrier channels and the possibility of hosting Pearl vortices. A Josephson junction with a quasi-ideal two-dimensional barrier has been realized by growing graphene on SiC with Al electrodes. Chemical vapor deposition offers centimeter size monolayer areas where it is possible to realize a comparative analysis of different devices with nominally the same barrier. In samples with a graphene gap below 400 nm, we have found evidence of Josephson coherence in the presence of an incipient Berezinskii-Kosterlitz-Thouless transition. When the magnetic field is cycled, a remarkable hysteretic collapse and revival of the Josephson supercurrent occurs. Similar hysteresis are found in granular systems and are usually justified within the Bean critical state model (CSM). We show that the CSM, with appropriate account for the low-dimensional geometry, can partly explain the odd features measured in these junctions. - info:eu-repo/grantAgreement/EC/FP7/604391//Graphene-Based Revolutions in ICT And Beyond (Graphene Flagship)/ - info:eu-repo/semantics/altIdentifier/doi/10.1103/PhysRevB.94.054525 - info:eu-repo/semantics/article - Physical Review B vol.94(2016) - info:eu-repo/semantics/openAccess - eng - Researchers - application/pdf + [replacement PSNC record: XML markup stripped during extraction; no text content survives] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_itgv4.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_itgv4.xml new file mode 100644 index 000000000..06325810b --- /dev/null +++
+ + In Copyright + GWU + 53 + 5-/6 + + + + + + http%3A%2F%2Fpub.uni-bielefeld.de%2Foai + oai:pub.uni-bielefeld.de:1997560 + 2018-07-24T12:58:03Z + + + + + false + false + 0.9 + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_zenodo.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_zenodo.xml new file mode 100644 index 000000000..871f1ed25 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_zenodo.xml @@ -0,0 +1,99 @@ + + + + r37b0ad08687::000374d100a9db469bd42b69dbb40b36 + 10.5281/zenodo.3234526 + 2020-03-23T03:03:50.72Z + r37b0ad08687 + oai:zenodo.org:3234526 + 2020-03-19T10:58:08Z + openaire_data + user-epfl + + + + true + 3.1 + CERN.ZENODO + + + 10.5281/zenodo.3234526 + + + Nouchi, Vincent + Physics of Aquatic Systems Laboratory (APHYS) – Margaretha Kamprad Chair, ENAC, EPFL, Lausanne, 1015, Switzerland + + + Lavanchy, Sébastien + Physics of Aquatic Systems Laboratory (APHYS) – Margaretha Kamprad Chair, ENAC, EPFL, Lausanne, 1015, Switzerland + + + Baracchini, Theo + Physics of Aquatic Systems Laboratory (APHYS) – Margaretha Kamprad Chair, ENAC, EPFL, Lausanne, 1015, Switzerland + + + Wüest, Alfred + Physics of Aquatic Systems Laboratory (APHYS) – Margaretha Kamprad Chair, ENAC, EPFL, Lausanne, 1015, Switzerland + + + Bouffard, Damien + Eawag, Swiss Federal Institute of Aquatic Science and Technology, Surface Waters – Research and Management, Kastanienbaum, 6047, Switzerland + + + + Temperature and ADCP data collected on Lake Geneva between 2015 and 2017 + + Zenodo + 2019 + + Lake Geneva + temperature + ADCP + + + 2019 + + + + 10.5281/zenodo.3234525 + https://zenodo.org/communities/epfl + + 1.0.0 + + Creative Commons Attribution 4.0 International + Open Access + + +

Data collected between 2015 and 2017 on Lake Geneva by Acoustic Doppler Current Profiler (ADCP) and CTDs. One file includes all the temperature profiles, the two others are the ADCP data (up- and down-looking) at the SHL2 station (centre of the main basin). Coordinates of the SHL2 station are 534700 and 144950 in the Swiss CH1903 coordinate system. The file with the CTD data contains the coordinates of the sample location (lat, lon), times (in MATLAB time), depths (in meters) and temperatures (in &deg;C). + All files are in MATLAB .mat format. + [closing datacite resource markup stripped during extraction] + https%3A%2F%2Fzenodo.org%2Foai2d + oai:zenodo.org:3234526 + 2020-03-19T10:58:08Z + false + false + 0.9 + [closing provenance/about markup stripped during extraction]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl new file mode 100644 index 000000000..f815c0260 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl @@ -0,0 +1,432 @@ [432 added lines of XSLT; the markup was stripped during extraction. Surviving literals: the abort message "record is not compliant, transformation is interrupted.", the access-rights values OPEN, RESTRICTED and UNKNOWN, and the country code DE.] \ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite_orig.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite_orig.xsl new file mode 100644 index 000000000..d8b14fadd --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite_orig.xsl @@ -0,0 +1,472 @@ [472 added lines of XSLT; the markup was stripped during extraction. Surviving literals: the abort message "record is not compliant, transformation is interrupted.", the access-rights values OPEN, RESTRICTED and UNKNOWN, and the country code DE.] \ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire.xsl new file mode 100644 index 000000000..53a3466a9 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire.xsl @@ -0,0 +1,82 @@ [82 added lines of XSLT; the markup was stripped during extraction. Surviving literal: the abort message "record is not compliant, transformation is interrupted."] \ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl new file mode 100644 index 000000000..56451505e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl @@ -0,0 +1,791 @@ [791 added lines of XSLT; the markup was stripped during extraction. Surviving literal: the abort message "record is not compliant, transformation is interrupted."] \ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid_orig.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid_orig.xsl new file mode 100644 index 000000000..3cfaec80b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid_orig.xsl @@ -0,0 +1,1081 @@ [1081 added lines of XSLT; the markup was stripped during extraction. Surviving literal: the abort message "record is not compliant, transformation is interrupted."] \ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/zenodo_tr.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/zenodo_tr.xsl new file mode 100644 index 000000000..0c3f4b1f9 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/zenodo_tr.xsl @@ -0,0 +1,451 @@ [451 added lines of XSLT; the markup was stripped during extraction. Surviving literals: the access-rights values OPEN and CLOSED.] \ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/synonyms.txt b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/synonyms.txt new file mode 100644 index 000000000..729296522 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/synonyms.txt @@ -0,0 +1,1234 @@ +dnet:access_modes @=@ CLOSED @=@ http://purl.org/coar/access_right/c_14cb +dnet:access_modes @=@ CLOSED @=@ info:eu-repo/semantics/closedAccess +dnet:access_modes @=@ EMBARGO @=@ http://purl.org/coar/access_right/c_f1cf +dnet:access_modes @=@ EMBARGO @=@ info:eu-repo/semantics/embargoedAccess +dnet:access_modes @=@ OPEN @=@ Creative Commons License [CC BY-NC-ND] http://creativecommons.org/licenses/by-nc-nd/3.0/de/ +dnet:access_modes @=@ OPEN @=@ Creative commons +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/licenses/by-nc-nd/3.0/ +dnet:access_modes
@=@ OPEN @=@ http://creativecommons.org/licenses/by-nc/3.0/ +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/licenses/by-sa/3.0/ +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/licenses/by-sa/4.0/ +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/licenses/by/3.0/ +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/licenses/by/3.0/us/ +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/licenses/by/4.0/ +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/publicdomain/zero/1.0/ +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/publicdomain/zero/1.0/ & http://www.canadensys.net/norms +dnet:access_modes @=@ OPEN @=@ http://purl.org/coar/access_right/c_abf2 +dnet:access_modes @=@ OPEN @=@ https://creativecommons.org/licenses/by-nc/4.0/ +dnet:access_modes @=@ OPEN @=@ info:eu-repo/semantics/openAccess +dnet:access_modes @=@ OPEN @=@ open_access +dnet:access_modes @=@ RESTRICTED @=@ http://purl.org/coar/access_right/c_16ec +dnet:access_modes @=@ RESTRICTED @=@ info:eu-repo/semantics/restrictedAccess +dnet:compatibilityLevel @=@ openaire-pub_4.0 @=@ openaire4.0 +dnet:subject_classification_typologies @=@ jel @=@ jelElement +dnet:publication_resource @=@ 0018 @=@ Comment/debate +dnet:publication_resource @=@ 0018 @=@ http://purl.org/coar/resource_type/c_1162 +dnet:publication_resource @=@ 0018 @=@ info:eu-repo/semantics/annotation +dnet:publication_resource @=@ 0001 @=@ A1 Alkuperäisartikkeli tieteellisessä aikakauslehdessä +dnet:publication_resource @=@ 0001 @=@ Article +dnet:publication_resource @=@ 0001 @=@ Article (author) +dnet:publication_resource @=@ 0001 @=@ Article - letter to the editor +dnet:publication_resource @=@ 0001 @=@ Article / Letter to editor +dnet:publication_resource @=@ 0001 @=@ Article / Letter to the editor +dnet:publication_resource @=@ 0001 @=@ Article / Newspaper +dnet:publication_resource @=@ 0001 @=@ Article in journal +dnet:publication_resource @=@ 0001 @=@ Article in monograph or in proceedings +dnet:publication_resource @=@ 0001 @=@ Article in proceedings +dnet:publication_resource @=@ 0001 @=@ Article-letter to the editor +dnet:publication_resource @=@ 0001 @=@ Article/Letter to editor +dnet:publication_resource @=@ 0001 @=@ Articolo +dnet:publication_resource @=@ 0001 @=@ Artículo +dnet:publication_resource @=@ 0001 @=@ Aufsatz +dnet:publication_resource @=@ 0001 @=@ Clinical Study +dnet:publication_resource @=@ 0001 @=@ Institutional Series +dnet:publication_resource @=@ 0001 @=@ International Journal +dnet:publication_resource @=@ 0001 @=@ International Journal Abstract +dnet:publication_resource @=@ 0001 @=@ International Journal ISI/JCR +dnet:publication_resource @=@ 0001 @=@ Journal (full / special issue) +dnet:publication_resource @=@ 0001 @=@ Journal Article/Review +dnet:publication_resource @=@ 0001 @=@ Journal article +dnet:publication_resource @=@ 0001 @=@ Journal article (on-line or printed) +dnet:publication_resource @=@ 0001 @=@ Journal articles +dnet:publication_resource @=@ 0001 @=@ Journal paper +dnet:publication_resource @=@ 0001 @=@ National Journal +dnet:publication_resource @=@ 0001 @=@ Original article (non peer-reviewed) +dnet:publication_resource @=@ 0001 @=@ Original article (peer-reviewed) +dnet:publication_resource @=@ 0001 @=@ Peer-reviewed Article +dnet:publication_resource @=@ 0001 @=@ Published Journal Article +dnet:publication_resource @=@ 0001 @=@ Research Article +dnet:publication_resource @=@ 0001 @=@ Review article (non peer-reviewed) +dnet:publication_resource @=@ 0001 
@=@ Review article (peer-reviewed) +dnet:publication_resource @=@ 0001 @=@ Volumes Edited / Special Issues +dnet:publication_resource @=@ 0001 @=@ article in non peer-reviewed journal +dnet:publication_resource @=@ 0001 @=@ article in peer-reviewed journal +dnet:publication_resource @=@ 0001 @=@ article-commentary +dnet:publication_resource @=@ 0001 @=@ article_site_web +dnet:publication_resource @=@ 0001 @=@ doc-type:Journal Article +dnet:publication_resource @=@ 0001 @=@ doc-type:article +dnet:publication_resource @=@ 0001 @=@ http://purl.org/coar/resource_type/c_2df8fbb1 +dnet:publication_resource @=@ 0001 @=@ http://purl.org/coar/resource_type/c_545b +dnet:publication_resource @=@ 0001 @=@ http://purl.org/coar/resource_type/c_6501 +dnet:publication_resource @=@ 0001 @=@ http://purl.org/coar/resource_type/c_7877 +dnet:publication_resource @=@ 0001 @=@ in-brief +dnet:publication_resource @=@ 0001 @=@ info:eu-repo/semantics/article +dnet:publication_resource @=@ 0001 @=@ journal-article +dnet:publication_resource @=@ 0001 @=@ journalArticle +dnet:publication_resource @=@ 0001 @=@ journal_article +dnet:publication_resource @=@ 0001 @=@ letter +dnet:publication_resource @=@ 0001 @=@ non peer-reviewed article +dnet:publication_resource @=@ 0001 @=@ partial-retraction +dnet:publication_resource @=@ 0001 @=@ proceeding with peer review +dnet:publication_resource @=@ 0001 @=@ publication-article +dnet:publication_resource @=@ 0001 @=@ rapid-communication +dnet:publication_resource @=@ 0001 @=@ reply +dnet:publication_resource @=@ 0001 @=@ research-article +dnet:publication_resource @=@ 0001 @=@ retraction +dnet:publication_resource @=@ 0001 @=@ review-article +dnet:publication_resource @=@ 0001 @=@ text (article) +dnet:publication_resource @=@ 0001 @=@ Статья +dnet:publication_resource @=@ 0001 @=@ ArticleArtikel +dnet:publication_resource @=@ 0033 @=@ AUDIOVISUAL_DOCUMENT +dnet:publication_resource @=@ 0033 @=@ Audiovisual/Audiovisual +dnet:publication_resource @=@ 0033 @=@ http://purl.org/coar/resource_type/c_c513 +dnet:publication_resource @=@ 0008 @=@ Bachelor's +dnet:publication_resource @=@ 0008 @=@ Bachelor's Degree +dnet:publication_resource @=@ 0008 @=@ Bachelors Thesis +dnet:publication_resource @=@ 0008 @=@ Proyecto fin de carrera +dnet:publication_resource @=@ 0008 @=@ Undergraduate Thesis +dnet:publication_resource @=@ 0008 @=@ http://purl.org/coar/resource_type/c_7a1f +dnet:publication_resource @=@ 0008 @=@ info:eu-repo/semantics/bachelorThesis +dnet:publication_resource @=@ 0008 @=@ выпускная бакалаврская работа +dnet:publication_resource @=@ 0002 @=@ Book (monograph) +dnet:publication_resource @=@ 0002 @=@ Book (non peer-reviewed) +dnet:publication_resource @=@ 0002 @=@ Book (peer-reviewed) +dnet:publication_resource @=@ 0002 @=@ Book - monograph - editorial book +dnet:publication_resource @=@ 0002 @=@ Book Section +dnet:publication_resource @=@ 0002 @=@ Book as author +dnet:publication_resource @=@ 0002 @=@ Buch +dnet:publication_resource @=@ 0002 @=@ International Book/Monograph +dnet:publication_resource @=@ 0002 @=@ Libro +dnet:publication_resource @=@ 0002 @=@ Monografia +dnet:publication_resource @=@ 0002 @=@ Monograph +dnet:publication_resource @=@ 0002 @=@ National Book/Monograph +dnet:publication_resource @=@ 0002 @=@ atlas +dnet:publication_resource @=@ 0002 @=@ book +dnet:publication_resource @=@ 0002 @=@ book-series +dnet:publication_resource @=@ 0002 @=@ book-set +dnet:publication_resource @=@ 0002 @=@ book-track +dnet:publication_resource @=@ 0002 @=@ book_series 
+dnet:publication_resource @=@ 0002 @=@ book_title +dnet:publication_resource @=@ 0002 @=@ doc-type:book +dnet:publication_resource @=@ 0002 @=@ edited-book +dnet:publication_resource @=@ 0002 @=@ http://purl.org/coar/resource_type/c_2f33 +dnet:publication_resource @=@ 0002 @=@ info:eu-repo/semantics/book +dnet:publication_resource @=@ 0002 @=@ ouvrage +dnet:publication_resource @=@ 0002 @=@ publication-book +dnet:publication_resource @=@ 0002 @=@ reference-book +dnet:publication_resource @=@ 0002 @=@ scientific book +dnet:publication_resource @=@ 0002 @=@ Монография +dnet:publication_resource @=@ 0002 @=@ Учебник +dnet:publication_resource @=@ 0037 @=@ clinicalTrial +dnet:publication_resource @=@ 0037 @=@ http://purl.org/coar/resource_type/c_cb28 +dnet:publication_resource @=@ 0022 @=@ collection +dnet:publication_resource @=@ 0004 @=@ A4 Artikkeli konferenssijulkaisussa +dnet:publication_resource @=@ 0004 @=@ Comunicación de congreso +dnet:publication_resource @=@ 0004 @=@ Conference Paper +dnet:publication_resource @=@ 0004 @=@ Conference Paper/Proceeding/Abstract +dnet:publication_resource @=@ 0004 @=@ Conference Proceedings +dnet:publication_resource @=@ 0004 @=@ Conference article +dnet:publication_resource @=@ 0004 @=@ Conference contribution +dnet:publication_resource @=@ 0004 @=@ Conference lecture +dnet:publication_resource @=@ 0004 @=@ Conference or Workshop Item +dnet:publication_resource @=@ 0004 @=@ Conference paper, poster, etc. +dnet:publication_resource @=@ 0004 @=@ Conference papers +dnet:publication_resource @=@ 0004 @=@ Conference report +dnet:publication_resource @=@ 0004 @=@ International Conference +dnet:publication_resource @=@ 0004 @=@ International Conference Abstract/Poster +dnet:publication_resource @=@ 0004 @=@ International Conference ISI/JCR +dnet:publication_resource @=@ 0004 @=@ International Conference communication/abstract/poster +dnet:publication_resource @=@ 0004 @=@ National Conference +dnet:publication_resource @=@ 0004 @=@ National Conference Abstract/Poster +dnet:publication_resource @=@ 0004 @=@ National Conference communication/abstract/poster +dnet:publication_resource @=@ 0004 @=@ PREFACE_PROCEEDINGS +dnet:publication_resource @=@ 0004 @=@ PROCEEDING_PAPER +dnet:publication_resource @=@ 0004 @=@ Papers in Conference Proceedings +dnet:publication_resource @=@ 0004 @=@ Presentación +dnet:publication_resource @=@ 0004 @=@ Proceedings (peer-reviewed) +dnet:publication_resource @=@ 0004 @=@ Proceedings of a Conference +dnet:publication_resource @=@ 0004 @=@ Proceedings paper +dnet:publication_resource @=@ 0004 @=@ Póster +dnet:publication_resource @=@ 0004 @=@ actes_congres +dnet:publication_resource @=@ 0004 @=@ communication_avec_actes +dnet:publication_resource @=@ 0004 @=@ communication_invitee +dnet:publication_resource @=@ 0004 @=@ communication_par_affiche +dnet:publication_resource @=@ 0004 @=@ communication_sans_actes +dnet:publication_resource @=@ 0004 @=@ conference +dnet:publication_resource @=@ 0004 @=@ conference item +dnet:publication_resource @=@ 0004 @=@ conference proceeding +dnet:publication_resource @=@ 0004 @=@ conferenceObject +dnet:publication_resource @=@ 0004 @=@ conference_paper +dnet:publication_resource @=@ 0004 @=@ doc-type:conferenceObject +dnet:publication_resource @=@ 0004 @=@ http://purl.org/coar/resource_type/c_18co +dnet:publication_resource @=@ 0004 @=@ http://purl.org/coar/resource_type/c_18cp +dnet:publication_resource @=@ 0004 @=@ http://purl.org/coar/resource_type/c_5794 +dnet:publication_resource @=@ 0004 @=@ 
http://purl.org/coar/resource_type/c_6670 +dnet:publication_resource @=@ 0004 @=@ http://purl.org/coar/resource_type/c_c94f +dnet:publication_resource @=@ 0004 @=@ http://purl.org/coar/resource_type/c_f744 +dnet:publication_resource @=@ 0004 @=@ info:eu-repo/semantics/conferenceItem +dnet:publication_resource @=@ 0004 @=@ info:eu-repo/semantics/conferenceObject +dnet:publication_resource @=@ 0004 @=@ invited conference talk +dnet:publication_resource @=@ 0004 @=@ poster +dnet:publication_resource @=@ 0004 @=@ presentation +dnet:publication_resource @=@ 0004 @=@ proceeding, seminar, workshop without peer review +dnet:publication_resource @=@ 0004 @=@ proceedings +dnet:publication_resource @=@ 0004 @=@ proceedings-article +dnet:publication_resource @=@ 0004 @=@ publication-conferencepaper +dnet:publication_resource @=@ 0004 @=@ научный доклад +dnet:publication_resource @=@ 0005 @=@ Newspaper or magazine article +dnet:publication_resource @=@ 0005 @=@ http://purl.org/coar/resource_type/c_998f +dnet:publication_resource @=@ 0005 @=@ info:eu-repo/semantics/contributionToPeriodical +dnet:publication_resource @=@ 0045 @=@ Data Management Plan +dnet:publication_resource @=@ 0045 @=@ Data Management Plan (NSF Generic) +dnet:publication_resource @=@ 0045 @=@ http://purl.org/coar/resource_type/c_ab20 +dnet:publication_resource @=@ 0045 @=@ http://purl.org/spar/fabio/DataManagementPolicy +dnet:publication_resource @=@ 0045 @=@ http://purl.org/spar/fabio/DataManagementPolicyDocument +dnet:publication_resource @=@ 0045 @=@ http://purl.org/spar/fabio/DataMangementPlan +dnet:publication_resource @=@ 0045 @=@ plan de gestión de datos +dnet:publication_resource @=@ 0045 @=@ publication-datamanagementplan +dnet:publication_resource @=@ 0031 @=@ Data Descriptor +dnet:publication_resource @=@ 0031 @=@ DataPaper +dnet:publication_resource @=@ 0031 @=@ data-article +dnet:publication_resource @=@ 0031 @=@ http://purl.org/coar/resource_type/c_beb9 +dnet:publication_resource @=@ 0021 @=@ Dataset/Dataset +dnet:publication_resource @=@ 0021 @=@ Research Data +dnet:publication_resource @=@ 0021 @=@ dataset +dnet:publication_resource @=@ 0021 @=@ http://purl.org/coar/resource_type/c_ddb1 +dnet:publication_resource @=@ 0021 @=@ info:eu-repo/semantics/DDIInstance +dnet:publication_resource @=@ 0021 @=@ info:eu-repo/semantics/datafile +dnet:publication_resource @=@ 0021 @=@ info:eu-repo/semantics/dataset +dnet:publication_resource @=@ 0021 @=@ info:eu-repo/semantics/enhancedObjectFile +dnet:publication_resource @=@ 0006 @=@ Diss +dnet:publication_resource @=@ 0006 @=@ Dissertation +dnet:publication_resource @=@ 0006 @=@ Doctoral +dnet:publication_resource @=@ 0006 @=@ DoctoralThesis +dnet:publication_resource @=@ 0006 @=@ PhD thesis +dnet:publication_resource @=@ 0006 @=@ Tesis +dnet:publication_resource @=@ 0006 @=@ Text.Thesis.Doctoral +dnet:publication_resource @=@ 0006 @=@ Theses +dnet:publication_resource @=@ 0006 @=@ Thesis +dnet:publication_resource @=@ 0006 @=@ Thesis or Dissertation +dnet:publication_resource @=@ 0006 @=@ Thesis.Doctoral +dnet:publication_resource @=@ 0006 @=@ doc-type:doctoralThesis +dnet:publication_resource @=@ 0006 @=@ http://purl.org/coar/resource_type/c_db06 +dnet:publication_resource @=@ 0006 @=@ info:eu-repo/semantics/doctoralThesis +dnet:publication_resource @=@ 0006 @=@ publication-thesis +dnet:publication_resource @=@ 0006 @=@ these +dnet:publication_resource @=@ 0006 @=@ these exercice +dnet:publication_resource @=@ 0023 @=@ Event/Event +dnet:publication_resource @=@ 0023 @=@ event 
+dnet:publication_resource @=@ 0009 @=@ Departmental Technical Report +dnet:publication_resource @=@ 0009 @=@ Informe Técnico +dnet:publication_resource @=@ 0009 @=@ RESEARCH_REPORT +dnet:publication_resource @=@ 0009 @=@ Tech-Report +dnet:publication_resource @=@ 0009 @=@ Technical Report +dnet:publication_resource @=@ 0009 @=@ http://purl.org/coar/resource_type/c_18gh +dnet:publication_resource @=@ 0009 @=@ publication-technicalnote +dnet:publication_resource @=@ 0009 @=@ research report +dnet:publication_resource @=@ 0024 @=@ Video +dnet:publication_resource @=@ 0024 @=@ film +dnet:publication_resource @=@ 0024 @=@ http://purl.org/coar/resource_type/c_12ce +dnet:publication_resource @=@ 0024 @=@ http://purl.org/coar/resource_type/c_8a7e +dnet:publication_resource @=@ 0025 @=@ Diagram +dnet:publication_resource @=@ 0025 @=@ Drawing +dnet:publication_resource @=@ 0025 @=@ Figure +dnet:publication_resource @=@ 0025 @=@ Image/Image +dnet:publication_resource @=@ 0025 @=@ Imagen +dnet:publication_resource @=@ 0025 @=@ Photo +dnet:publication_resource @=@ 0025 @=@ Plot +dnet:publication_resource @=@ 0025 @=@ fotó +dnet:publication_resource @=@ 0025 @=@ grafika +dnet:publication_resource @=@ 0025 @=@ http://purl.org/coar/resource_type/c_ecc8 +dnet:publication_resource @=@ 0025 @=@ image +dnet:publication_resource @=@ 0025 @=@ image-diagram +dnet:publication_resource @=@ 0025 @=@ image-drawing +dnet:publication_resource @=@ 0025 @=@ image-figure +dnet:publication_resource @=@ 0025 @=@ image-other +dnet:publication_resource @=@ 0025 @=@ image-photo +dnet:publication_resource @=@ 0025 @=@ image-plot +dnet:publication_resource @=@ 0026 @=@ http://purl.org/coar/resource_type/c_e9a0 +dnet:publication_resource @=@ 0026 @=@ interactiveResource +dnet:publication_resource @=@ 0011 @=@ Internal note +dnet:publication_resource @=@ 0011 @=@ http://purl.org/coar/resource_type/c_18ww +dnet:publication_resource @=@ 0043 @=@ http://purl.org/coar/resource_type/c_0640 +dnet:publication_resource @=@ 0010 @=@ Inaugural lecture +dnet:publication_resource @=@ 0010 @=@ Material didáctico +dnet:publication_resource @=@ 0010 @=@ Public-Lecture +dnet:publication_resource @=@ 0010 @=@ http://purl.org/coar/resource_type/c_8544 +dnet:publication_resource @=@ 0010 @=@ info:eu-repo/semantics/lecture +dnet:publication_resource @=@ 0010 @=@ lesson +dnet:publication_resource @=@ 0010 @=@ Учебный материал +dnet:publication_resource @=@ 0007 @=@ Diploma Project +dnet:publication_resource @=@ 0007 @=@ MSc Thesis +dnet:publication_resource @=@ 0007 @=@ Master Degree +dnet:publication_resource @=@ 0007 @=@ Master's +dnet:publication_resource @=@ 0007 @=@ Masterarbeit u.a. 
+dnet:publication_resource @=@ 0007 @=@ Masters (Taught)
+dnet:publication_resource @=@ 0007 @=@ Masters thesis
+dnet:publication_resource @=@ 0007 @=@ Masters-Thesis.Magister
+dnet:publication_resource @=@ 0007 @=@ Tesina
+dnet:publication_resource @=@ 0007 @=@ Thesis.Master
+dnet:publication_resource @=@ 0007 @=@ Trabajo fin de Máster
+dnet:publication_resource @=@ 0007 @=@ doc-type:masterThesis
+dnet:publication_resource @=@ 0007 @=@ hdr
+dnet:publication_resource @=@ 0007 @=@ http://purl.org/coar/resource_type/c_bdcc
+dnet:publication_resource @=@ 0007 @=@ info:eu-repo/semantics/masterThesis
+dnet:publication_resource @=@ 0007 @=@ masterThesis
+dnet:publication_resource @=@ 0007 @=@ memoire
+dnet:publication_resource @=@ 0027 @=@ Model/Model
+dnet:publication_resource @=@ 0027 @=@ model
+dnet:publication_resource @=@ 0020 @=@ Exhibition
+dnet:publication_resource @=@ 0020 @=@ Learning Object
+dnet:publication_resource @=@ 0020 @=@ Mapa
+dnet:publication_resource @=@ 0020 @=@ Modelo de utilidad
+dnet:publication_resource @=@ 0020 @=@ PEDAGOGICAL_DOCUMENT
+dnet:publication_resource @=@ 0020 @=@ Partitura
+dnet:publication_resource @=@ 0020 @=@ Sitio web
+dnet:publication_resource @=@ 0020 @=@ Trabajo de divulgación
+dnet:publication_resource @=@ 0020 @=@ Web publication/site
+dnet:publication_resource @=@ 0020 @=@ application
+dnet:publication_resource @=@ 0020 @=@ artefact
+dnet:publication_resource @=@ 0020 @=@ carte
+dnet:publication_resource @=@ 0020 @=@ composition
+dnet:publication_resource @=@ 0020 @=@ document_audiovisuel
+dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_12cc
+dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_12cd
+dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_1843
+dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_18cd
+dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_18cw
+dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_26e4
+dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_7ad9
+dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_e059
+dnet:publication_resource @=@ 0020 @=@ info:eu-repo/semantics/other
+dnet:publication_resource @=@ 0020 @=@ learningObject
+dnet:publication_resource @=@ 0020 @=@ map
+dnet:publication_resource @=@ 0020 @=@ misc
+dnet:publication_resource @=@ 0020 @=@ other
+dnet:publication_resource @=@ 0020 @=@ revue
+dnet:publication_resource @=@ 0038 @=@ Abstract
+dnet:publication_resource @=@ 0038 @=@ Blog
+dnet:publication_resource @=@ 0038 @=@ Book Prospectus
+dnet:publication_resource @=@ 0038 @=@ Dictionary Entry
+dnet:publication_resource @=@ 0038 @=@ Disclosure
+dnet:publication_resource @=@ 0038 @=@ Editorial
+dnet:publication_resource @=@ 0038 @=@ Editorial ISI/JCR
+dnet:publication_resource @=@ 0038 @=@ Editors
+dnet:publication_resource @=@ 0038 @=@ Editors (non peer-reviewed)
+dnet:publication_resource @=@ 0038 @=@ Editors (peer-reviewed)
+dnet:publication_resource @=@ 0038 @=@ Encyclopedia Entry
+dnet:publication_resource @=@ 0038 @=@ Entrada de blog
+dnet:publication_resource @=@ 0038 @=@ Funding Submission
+dnet:publication_resource @=@ 0038 @=@ HabilitationThesis
+dnet:publication_resource @=@ 0038 @=@ License
+dnet:publication_resource @=@ 0038 @=@ Manual
+dnet:publication_resource @=@ 0038 @=@ Manuscript
+dnet:publication_resource @=@ 0038 @=@ Manuscrito
+dnet:publication_resource @=@ 0038 @=@ Other publication (non peer-review)
+dnet:publication_resource @=@ 0038 @=@ Other publication (peer-review)
+dnet:publication_resource @=@ 0038 @=@ Revista
+dnet:publication_resource @=@ 0038 @=@ Supervised Student Publication
+dnet:publication_resource @=@ 0038 @=@ Tesis/trabajos de grado – Thesis
+dnet:publication_resource @=@ 0038 @=@ Text
+dnet:publication_resource @=@ 0038 @=@ Text/Text
+dnet:publication_resource @=@ 0038 @=@ Trademark
+dnet:publication_resource @=@ 0038 @=@ Translation
+dnet:publication_resource @=@ 0038 @=@ afterword
+dnet:publication_resource @=@ 0038 @=@ avantpropos
+dnet:publication_resource @=@ 0038 @=@ bibliography
+dnet:publication_resource @=@ 0038 @=@ chronique
+dnet:publication_resource @=@ 0038 @=@ compte rendu
+dnet:publication_resource @=@ 0038 @=@ correction
+dnet:publication_resource @=@ 0038 @=@ foreword
+dnet:publication_resource @=@ 0038 @=@ habilitation à diriger des recherches
+dnet:publication_resource @=@ 0038 @=@ historicalDocument
+dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_0040
+dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_0857
+dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_18cf
+dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_18wz
+dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_3e5a
+dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_46ec
+dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_6947
+dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_7acd
+dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_86bc
+dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_b239
+dnet:publication_resource @=@ 0038 @=@ note de lecture
+dnet:publication_resource @=@ 0038 @=@ notedelecture
+dnet:publication_resource @=@ 0038 @=@ other publication
+dnet:publication_resource @=@ 0038 @=@ postface
+dnet:publication_resource @=@ 0038 @=@ publication-other
+dnet:publication_resource @=@ 0038 @=@ revuedepresse
+dnet:publication_resource @=@ 0038 @=@ sa_component
+dnet:publication_resource @=@ 0038 @=@ standard
+dnet:publication_resource @=@ 0038 @=@ standard-series
+dnet:publication_resource @=@ 0013 @=@ A3 Kirjan tai muun kokoomateoksen osa
+dnet:publication_resource @=@ 0013 @=@ Book Part (author)
+dnet:publication_resource @=@ 0013 @=@ Book Section / Chapter
+dnet:publication_resource @=@ 0013 @=@ Book chapter or Essay in book
+dnet:publication_resource @=@ 0013 @=@ Book editorial
+dnet:publication_resource @=@ 0013 @=@ Book section
+dnet:publication_resource @=@ 0013 @=@ Book_Chapter
+dnet:publication_resource @=@ 0013 @=@ Buchbeitrag
+dnet:publication_resource @=@ 0013 @=@ Capítulo de libro
+dnet:publication_resource @=@ 0013 @=@ Contribution to International Book/Monograph
+dnet:publication_resource @=@ 0013 @=@ Contribution to International Book/Monograph ISI/JCR
+dnet:publication_resource @=@ 0013 @=@ Contribution to National Book/Monograph
+dnet:publication_resource @=@ 0013 @=@ Contribution to book (non peer-reviewed)
+dnet:publication_resource @=@ 0013 @=@ Contribution to book (peer-reviewed)
+dnet:publication_resource @=@ 0013 @=@ Part of book - chapter
+dnet:publication_resource @=@ 0013 @=@ book chapter
+dnet:publication_resource @=@ 0013 @=@ book-part
+dnet:publication_resource @=@ 0013 @=@ bookPart
+dnet:publication_resource @=@ 0013 @=@ book_content
+dnet:publication_resource @=@ 0013 @=@ chapitre_ouvrage
+dnet:publication_resource @=@ 0013 @=@ chapter
+dnet:publication_resource @=@ 0013 @=@ doc-type:bookPart
+dnet:publication_resource @=@ 0013 @=@ http://purl.org/coar/resource_type/c_3248
+dnet:publication_resource @=@ 0013 @=@ info:eu-repo/semantics/bookPart
+dnet:publication_resource @=@ 0013 @=@ publication-section
+dnet:publication_resource @=@ 0013 @=@ reference-entry
+dnet:publication_resource @=@ 0013 @=@ reference_entry
+dnet:publication_resource @=@ 0013 @=@ scientific book chapter
+dnet:publication_resource @=@ 0013 @=@ Глава монографии
+dnet:publication_resource @=@ 0019 @=@ H1 Myönnetty patentti
+dnet:publication_resource @=@ 0019 @=@ Patent
+dnet:publication_resource @=@ 0019 @=@ Patente
+dnet:publication_resource @=@ 0019 @=@ Solicitud de patente
+dnet:publication_resource @=@ 0019 @=@ Traducción de patente
+dnet:publication_resource @=@ 0019 @=@ brevet
+dnet:publication_resource @=@ 0019 @=@ http://purl.org/coar/resource_type/c_15cd
+dnet:publication_resource @=@ 0019 @=@ info:eu-repo/semantics/patent
+dnet:publication_resource @=@ 0019 @=@ publication-patent
+dnet:publication_resource @=@ 0028 @=@ Service
+dnet:publication_resource @=@ 0028 @=@ physicalObject
+dnet:publication_resource @=@ 0016 @=@ Pre Print
+dnet:publication_resource @=@ 0016 @=@ Pre-print
+dnet:publication_resource @=@ 0016 @=@ http://purl.org/coar/resource_type/c_816b
+dnet:publication_resource @=@ 0016 @=@ info:eu-repo/semantics/preprint
+dnet:publication_resource @=@ 0016 @=@ publication-preprint
+dnet:publication_resource @=@ 0016 @=@ Препринт
+dnet:publication_resource @=@ 0034 @=@ Project deliverable
+dnet:publication_resource @=@ 0034 @=@ http://purl.org/coar/resource_type/c_18op
+dnet:publication_resource @=@ 0034 @=@ publication-deliverable
+dnet:publication_resource @=@ 0035 @=@ Project milestone
+dnet:publication_resource @=@ 0035 @=@ publication-milestone
+dnet:publication_resource @=@ 0036 @=@ Proposal
+dnet:publication_resource @=@ 0036 @=@ http://purl.org/coar/resource_type/c_baaf
+dnet:publication_resource @=@ 0036 @=@ research-proposal
+dnet:publication_resource @=@ 0017 @=@ ACTIVITY_REPORT
+dnet:publication_resource @=@ 0017 @=@ Commissioned report
+dnet:publication_resource @=@ 0017 @=@ D4 Julkaistu kehittämis- tai tutkimusraportti tai -selvitys
+dnet:publication_resource @=@ 0017 @=@ Deliverable
+dnet:publication_resource @=@ 0017 @=@ Documento tecnico
+dnet:publication_resource @=@ 0017 @=@ Project Report
+dnet:publication_resource @=@ 0017 @=@ Software documentation
+dnet:publication_resource @=@ 0017 @=@ brief-report
+dnet:publication_resource @=@ 0017 @=@ case-report
+dnet:publication_resource @=@ 0017 @=@ chapitre_rapport
+dnet:publication_resource @=@ 0017 @=@ doc-type:report
+dnet:publication_resource @=@ 0017 @=@ document_institutionnel
+dnet:publication_resource @=@ 0017 @=@ document_technique
+dnet:publication_resource @=@ 0017 @=@ http://purl.org/coar/resource_type/c_186u
+dnet:publication_resource @=@ 0017 @=@ http://purl.org/coar/resource_type/c_18hj
+dnet:publication_resource @=@ 0017 @=@ http://purl.org/coar/resource_type/c_18wq
+dnet:publication_resource @=@ 0017 @=@ http://purl.org/coar/resource_type/c_18ws
+dnet:publication_resource @=@ 0017 @=@ http://purl.org/coar/resource_type/c_71bd
+dnet:publication_resource @=@ 0017 @=@ http://purl.org/coar/resource_type/c_93fc
+dnet:publication_resource @=@ 0017 @=@ http://purl.org/coar/resource_type/c_ba1f
+dnet:publication_resource @=@ 0017 @=@ info:eu-repo/semantics/report
+dnet:publication_resource @=@ 0017 @=@ publication-report
+dnet:publication_resource @=@ 0017 @=@ publication-softwaredocumentation
+dnet:publication_resource @=@ 0017 @=@ rapport_expertise
+dnet:publication_resource @=@ 0017 @=@ rapport_mission
+dnet:publication_resource @=@ 0017 @=@ report
+dnet:publication_resource @=@ 0017 @=@ report-paper
+dnet:publication_resource @=@ 0017 @=@ report-paper_title
+dnet:publication_resource @=@ 0017 @=@ report-series
+dnet:publication_resource @=@ 0017 @=@ support_cours
+dnet:publication_resource @=@ 0014 @=@ Arbeitspapier
+dnet:publication_resource @=@ 0014 @=@ Departmental Bulletin Paper
+dnet:publication_resource @=@ 0014 @=@ Documento de trabajo
+dnet:publication_resource @=@ 0014 @=@ Paper
+dnet:publication_resource @=@ 0014 @=@ Project description
+dnet:publication_resource @=@ 0014 @=@ Research-Paper
+dnet:publication_resource @=@ 0014 @=@ ResearchPaper
+dnet:publication_resource @=@ 0014 @=@ Working / discussion paper
+dnet:publication_resource @=@ 0014 @=@ Working Paper
+dnet:publication_resource @=@ 0014 @=@ Working Paper / Technical Report
+dnet:publication_resource @=@ 0014 @=@ doc-type:workingPaper
+dnet:publication_resource @=@ 0014 @=@ http://purl.org/coar/resource_type/c_8042
+dnet:publication_resource @=@ 0014 @=@ info:eu-repo/semantics/paper
+dnet:publication_resource @=@ 0014 @=@ info:eu-repo/semantics/workingPaper
+dnet:publication_resource @=@ 0014 @=@ publication-workingpaper
+dnet:publication_resource @=@ 0014 @=@ workingPaper
+dnet:publication_resource @=@ 0015 @=@ A2 Katsausartikkeli tieteellisessä aikakauslehdessä
+dnet:publication_resource @=@ 0015 @=@ Book Review
+dnet:publication_resource @=@ 0015 @=@ Book/Film/Article review
+dnet:publication_resource @=@ 0015 @=@ Literature review
+dnet:publication_resource @=@ 0015 @=@ Peer review
+dnet:publication_resource @=@ 0015 @=@ Reseña bibliográfica
+dnet:publication_resource @=@ 0015 @=@ Review Article
+dnet:publication_resource @=@ 0015 @=@ RezensionReview
+dnet:publication_resource @=@ 0015 @=@ book-review
+dnet:publication_resource @=@ 0015 @=@ http://purl.org/coar/resource_type/c_ba08
+dnet:publication_resource @=@ 0015 @=@ http://purl.org/coar/resource_type/c_dcae04bc
+dnet:publication_resource @=@ 0015 @=@ http://purl.org/coar/resource_type/c_efa0
+dnet:publication_resource @=@ 0015 @=@ info:eu-repo/semantics/review
+dnet:publication_resource @=@ 0015 @=@ peer-review
+dnet:publication_resource @=@ 0029 @=@ Software
+dnet:publication_resource @=@ 0029 @=@ Software/Software
+dnet:publication_resource @=@ 0029 @=@ Workflow
+dnet:publication_resource @=@ 0029 @=@ Workflow/Workflow
+dnet:publication_resource @=@ 0029 @=@ http://purl.org/coar/resource_type/c_393c
+dnet:publication_resource @=@ 0029 @=@ http://purl.org/coar/resource_type/c_5ce6
+dnet:publication_resource @=@ 0029 @=@ http://purl.org/coar/resource_type/c_c950
+dnet:publication_resource @=@ 0032 @=@ http://purl.org/coar/resource_type/c_7bab
+dnet:publication_resource @=@ 0030 @=@ http://purl.org/coar/resource_type/c_18cc
+dnet:publication_resource @=@ 0030 @=@ sound
+dnet:publication_resource @=@ 0044 @=@ Graduate diploma
+dnet:publication_resource @=@ 0044 @=@ Undergraduate diploma
+dnet:publication_resource @=@ 0000 @=@ UNKNOWN
+dnet:publication_resource @=@ 0042 @=@ EGI Virtual Appliance
+dnet:languages @=@ abk @=@ ab
+dnet:languages @=@ aar @=@ aa
+dnet:languages @=@ afr @=@ af
+dnet:languages @=@ alb/sqi @=@ sq
+dnet:languages @=@ amh @=@ am
+dnet:languages @=@ ara @=@ ar
+dnet:languages @=@ arm/hye @=@ hy
+dnet:languages @=@ asm @=@ as
+dnet:languages @=@ ina @=@ ia
+dnet:languages @=@ aym @=@ ay
+dnet:languages @=@ aze @=@ az
+dnet:languages @=@ bak @=@ ba
+dnet:languages @=@ baq/eus @=@ eu
+dnet:languages @=@ bel @=@ be
+dnet:languages @=@ ben @=@ bn
+dnet:languages @=@ bih @=@ bh
+dnet:languages @=@ bis @=@ bi
+dnet:languages @=@ bre @=@ br
+dnet:languages @=@ bul @=@ bg
+dnet:languages @=@ bur/mya @=@ my
+dnet:languages @=@ cat @=@ ca
+dnet:languages @=@ chi/zho @=@ zh
+dnet:languages @=@ cos @=@ co
+dnet:languages @=@ hrv @=@ hr
+dnet:languages @=@ hrv @=@ hr
+dnet:languages @=@ hrv @=@ scr/hrv
+dnet:languages @=@ ces/cze @=@ cs
+dnet:languages @=@ dan @=@ da
+dnet:languages @=@ dut/nld @=@ dut/nla
+dnet:languages @=@ dut/nld @=@ dutdut
+dnet:languages @=@ dut/nld @=@ nl
+dnet:languages @=@ dut/nld @=@ nl_be
+dnet:languages @=@ dut/nld @=@ nl_nl
+dnet:languages @=@ dut/nld @=@ nld
+dnet:languages @=@ dzo @=@ dz
+dnet:languages @=@ eng @=@ en
+dnet:languages @=@ eng @=@ en_au
+dnet:languages @=@ eng @=@ en_en
+dnet:languages @=@ eng @=@ en_gb
+dnet:languages @=@ eng @=@ en_nz
+dnet:languages @=@ eng @=@ en_us
+dnet:languages @=@ eng @=@ english
+dnet:languages @=@ eng @=@ en-us
+dnet:languages @=@ eng @=@ en-US
+dnet:languages @=@ eng @=@ English
+dnet:languages @=@ eng @=@ EN
+dnet:languages @=@ eng @=@ en angielski
+dnet:languages @=@ eng @=@ en-GB
+dnet:languages @=@ eng @=@ Englisch
+dnet:languages @=@ epo @=@ eo
+dnet:languages @=@ est @=@ et
+dnet:languages @=@ fao @=@ fo
+dnet:languages @=@ fij @=@ fj
+dnet:languages @=@ fin @=@ fi
+dnet:languages @=@ fin @=@ Finnish
+dnet:languages @=@ fra/fre @=@ fr
+dnet:languages @=@ fra/fre @=@ FR
+dnet:languages @=@ fra/fre @=@ fr_be
+dnet:languages @=@ fra/fre @=@ fr_fr
+dnet:languages @=@ fra/fre @=@ fre/fra
+dnet:languages @=@ fra/fre @=@ fra
+dnet:languages @=@ fry @=@ fy
+dnet:languages @=@ glg @=@ gl
+dnet:languages @=@ geo/kat @=@ ka
+dnet:languages @=@ deu/ger @=@ de
+dnet:languages @=@ deu/ger @=@ ger/deu
+dnet:languages @=@ deu/ger @=@ german
+dnet:languages @=@ deu/ger @=@ ger
+dnet:languages @=@ deu/ger @=@ deu
+dnet:languages @=@ deu/ger @=@ DE-de
+dnet:languages @=@ ell/gre @=@ el
+dnet:languages @=@ ell/gre @=@ gr
+dnet:languages @=@ ell/gre @=@ el-GR
+dnet:languages @=@ kal @=@ kl
+dnet:languages @=@ grn @=@ gn
+dnet:languages @=@ guj @=@ gu
+dnet:languages @=@ hau @=@ ha
+dnet:languages @=@ heb @=@ he
+dnet:languages @=@ hin @=@ hi
+dnet:languages @=@ hun @=@ hu
+dnet:languages @=@ ice/isl @=@ is
+dnet:languages @=@ ine @=@ -
+dnet:languages @=@ ind @=@ id
+dnet:languages @=@ iku @=@ iu
+dnet:languages @=@ ipk @=@ ik
+dnet:languages @=@ gai/iri @=@ ga
+dnet:languages @=@ gai/iri @=@ gle
+dnet:languages @=@ ita @=@ it
+dnet:languages @=@ jpn @=@ ja
+dnet:languages @=@ jav @=@ jv
+dnet:languages @=@ jav @=@ jv/jw
+dnet:languages @=@ jav @=@ jw
+dnet:languages @=@ kan @=@ kn
+dnet:languages @=@ kas @=@ ks
+dnet:languages @=@ kaz @=@ kk
+dnet:languages @=@ khm @=@ km
+dnet:languages @=@ kin @=@ rw
+dnet:languages @=@ kir @=@ ky
+dnet:languages @=@ kor @=@ ko
+dnet:languages @=@ kur @=@ ku
+dnet:languages @=@ lao @=@ lo
+dnet:languages @=@ lat @=@ la
+dnet:languages @=@ lav @=@ lv
+dnet:languages @=@ lin @=@ ln
+dnet:languages @=@ lit @=@ lt
+dnet:languages @=@ mac/mak @=@ mk
+dnet:languages @=@ mlg @=@ mg
+dnet:languages @=@ may/msa @=@ ms
+dnet:languages @=@ mlt @=@ ml
+dnet:languages @=@ mao/mri @=@ mi
+dnet:languages @=@ mar @=@ mr
+dnet:languages @=@ mol @=@ mo
+dnet:languages @=@ mon @=@ mn
+dnet:languages @=@ nau @=@ na
+dnet:languages @=@ nep @=@ ne
+dnet:languages @=@ nor @=@ no
+dnet:languages @=@ oci @=@ oc
+dnet:languages @=@ ori @=@ or
+dnet:languages @=@ orm @=@ om
+dnet:languages @=@ pan @=@ pa
+dnet:languages @=@ fas/per @=@ fa
+dnet:languages @=@ pol @=@ pl
+dnet:languages @=@ por @=@ pt
+dnet:languages @=@ por @=@ pt_pt
+dnet:languages @=@ pus @=@ ps
+dnet:languages @=@ que @=@ qu
+dnet:languages @=@ roh @=@ rm
+dnet:languages @=@ ron/rum @=@ ro
+dnet:languages @=@ run @=@ rn
+dnet:languages @=@ rus @=@ ru
+dnet:languages @=@ smo @=@ sm
+dnet:languages @=@ sag @=@ sg
+dnet:languages @=@ san @=@ sa
+dnet:languages @=@ srp @=@ scc/srp
+dnet:languages @=@ srp @=@ sr
+dnet:languages @=@ scr @=@ sh
+dnet:languages @=@ sna @=@ sn
+dnet:languages @=@ snd @=@ sd
+dnet:languages @=@ sin @=@ si
+dnet:languages @=@ sit @=@ -
+dnet:languages @=@ slk/slo @=@ sk
+dnet:languages @=@ slv @=@ sl
+dnet:languages @=@ som @=@ so
+dnet:languages @=@ sot @=@ st
+dnet:languages @=@ esl/spa @=@ es
+dnet:languages @=@ sun @=@ su
+dnet:languages @=@ swa @=@ sw
+dnet:languages @=@ ssw @=@ ss
+dnet:languages @=@ swe @=@ sv
+dnet:languages @=@ swe @=@ sve/swe
+dnet:languages @=@ tgl @=@ tl
+dnet:languages @=@ tgk @=@ tg
+dnet:languages @=@ tam @=@ ta
+dnet:languages @=@ tat @=@ tt
+dnet:languages @=@ tel @=@ te
+dnet:languages @=@ tha @=@ th
+dnet:languages @=@ tha @=@ thai
+dnet:languages @=@ bod/tib @=@ bo
+dnet:languages @=@ tir @=@ ti
+dnet:languages @=@ tog @=@ to
+dnet:languages @=@ tso @=@ ts
+dnet:languages @=@ tsn @=@ tn
+dnet:languages @=@ tur @=@ tr
+dnet:languages @=@ tuk @=@ tk
+dnet:languages @=@ twi @=@ tw
+dnet:languages @=@ uig @=@ ug
+dnet:languages @=@ ukr @=@ uk
+dnet:languages @=@ und @=@ UNKNOWN
+dnet:languages @=@ und @=@ none
+dnet:languages @=@ urd @=@ ur
+dnet:languages @=@ uzb @=@ uz
+dnet:languages @=@ vie @=@ vi
+dnet:languages @=@ vol @=@ vo
+dnet:languages @=@ wln @=@ wa
+dnet:languages @=@ cym/wel @=@ cy
+dnet:languages @=@ wol @=@ wo
+dnet:languages @=@ xho @=@ xh
+dnet:languages @=@ yid @=@ yi
+dnet:languages @=@ yor @=@ yo
+dnet:languages @=@ zha @=@ za
+dnet:languages @=@ zul @=@ zu
+dnet:result_typologies @=@ dataset @=@ 0021
+dnet:result_typologies @=@ dataset @=@ 0024
+dnet:result_typologies @=@ dataset @=@ 0025
+dnet:result_typologies @=@ dataset @=@ 0030
+dnet:result_typologies @=@ dataset @=@ 0033
+dnet:result_typologies @=@ dataset @=@ 0037
+dnet:result_typologies @=@ dataset @=@ 0039
+dnet:result_typologies @=@ dataset @=@ 0046
+dnet:result_typologies @=@ other @=@ 0000
+dnet:result_typologies @=@ other @=@ 0010
+dnet:result_typologies @=@ other @=@ 0018
+dnet:result_typologies @=@ other @=@ 0020
+dnet:result_typologies @=@ other @=@ 0022
+dnet:result_typologies @=@ other @=@ 0023
+dnet:result_typologies @=@ other @=@ 0026
+dnet:result_typologies @=@ other @=@ 0027
+dnet:result_typologies @=@ other @=@ 0028
+dnet:result_typologies @=@ other @=@ 0042
+dnet:result_typologies @=@ publication @=@ 0001
+dnet:result_typologies @=@ publication @=@ 0002
+dnet:result_typologies @=@ publication @=@ 0004
+dnet:result_typologies @=@ publication @=@ 0005
+dnet:result_typologies @=@ publication @=@ 0006
+dnet:result_typologies @=@ publication @=@ 0007
+dnet:result_typologies @=@ publication @=@ 0008
+dnet:result_typologies @=@ publication @=@ 0009
+dnet:result_typologies @=@ publication @=@ 0011
+dnet:result_typologies @=@ publication @=@ 0012
+dnet:result_typologies @=@ publication @=@ 0013
+dnet:result_typologies @=@ publication @=@ 0014
+dnet:result_typologies @=@ publication @=@ 0015
+dnet:result_typologies @=@ publication @=@ 0016
+dnet:result_typologies @=@ publication @=@ 0017
+dnet:result_typologies @=@ publication @=@ 0019
+dnet:result_typologies @=@ publication @=@ 0031
+dnet:result_typologies @=@ publication @=@ 0032
+dnet:result_typologies @=@ publication @=@ 0034
+dnet:result_typologies @=@ publication @=@ 0035
+dnet:result_typologies @=@ publication @=@ 0036
+dnet:result_typologies @=@ publication @=@ 0038
+dnet:result_typologies @=@ publication @=@ 0044
+dnet:result_typologies @=@ publication @=@ 0045
+dnet:result_typologies @=@ software @=@ 0029
+dnet:result_typologies @=@ software @=@ 0040
+dnet:countries @=@ AF @=@ AFG
+dnet:countries @=@ AF @=@ Afghanistan
+dnet:countries @=@ AD @=@ Andorra
+dnet:countries @=@ AO @=@ Angola
+dnet:countries @=@ AR @=@ ARG
+dnet:countries @=@ AR @=@ Argentina
+dnet:countries @=@ AU @=@ AUS
+dnet:countries @=@ AU @=@ Australia
+dnet:countries @=@ AT @=@ AUT
+dnet:countries @=@ AT @=@ Austria
+dnet:countries @=@ AZ @=@ AZE
+dnet:countries @=@ BD @=@ Bangladesh
+dnet:countries @=@ BY @=@ Belarus
+dnet:countries @=@ BE @=@ BEL
+dnet:countries @=@ BE @=@ Belgium
+dnet:countries @=@ BJ @=@ BEN
+dnet:countries @=@ BO @=@ Bolivia, Plurinational State of
+dnet:countries @=@ BA @=@ BIH
+dnet:countries @=@ BA @=@ Bosnia-Hercegovina
+dnet:countries @=@ BR @=@ BRA
+dnet:countries @=@ BR @=@ Brazil
+dnet:countries @=@ BG @=@ Bulgaria
+dnet:countries @=@ BF @=@ BFA
+dnet:countries @=@ KH @=@ Cambodia
+dnet:countries @=@ KH @=@ Cambogia
+dnet:countries @=@ KH @=@ Campuchea
+dnet:countries @=@ CM @=@ CMR
+dnet:countries @=@ CA @=@ CAN
+dnet:countries @=@ CA @=@ Canada
+dnet:countries @=@ CV @=@ Cape Verde
+dnet:countries @=@ CL @=@ CHL
+dnet:countries @=@ CL @=@ Chile
+dnet:countries @=@ CN @=@ CHN
+dnet:countries @=@ CN @=@ China
+dnet:countries @=@ CO @=@ COL
+dnet:countries @=@ CO @=@ Colombia
+dnet:countries @=@ CD @=@ Congo
+dnet:countries @=@ CD @=@ Congo Democratic Republic (formerly Zaire)
+dnet:countries @=@ CD @=@ Congo, Republic
+dnet:countries @=@ CD @=@ Congo, the Democratic Republic of the
+dnet:countries @=@ CD @=@ Zaire
+dnet:countries @=@ CR @=@ CRI
+dnet:countries @=@ CI @=@ CIV
+dnet:countries @=@ CI @=@ Ivory Coast
+dnet:countries @=@ HR @=@ Croatia
+dnet:countries @=@ HR @=@ HRV
+dnet:countries @=@ CY @=@ CYP
+dnet:countries @=@ CY @=@ Cyprus
+dnet:countries @=@ CZ @=@ CZE
+dnet:countries @=@ CZ @=@ Czech Republic
+dnet:countries @=@ CZ @=@ Czechia
+dnet:countries @=@ CZ @=@ Czechoslovakia
+dnet:countries @=@ DK @=@ DNK
+dnet:countries @=@ DK @=@ Denmark
+dnet:countries @=@ EC @=@ Ecuador
+dnet:countries @=@ EG @=@ EGY
+dnet:countries @=@ EG @=@ Egypt
+dnet:countries @=@ SV @=@ SLV
+dnet:countries @=@ EE @=@ EST
+dnet:countries @=@ EE @=@ Estonia
+dnet:countries @=@ ET @=@ ETH
+dnet:countries @=@ EU @=@ EEC
+dnet:countries @=@ FJ @=@ FJI
+dnet:countries @=@ FI @=@ FIN
+dnet:countries @=@ FI @=@ Finland
+dnet:countries @=@ MK @=@ Macedonia
+dnet:countries @=@ MK @=@ Macedonia, the Former Yugoslav Republic Of
+dnet:countries @=@ MK @=@ North Macedonia
+dnet:countries @=@ FR @=@ FRA
+dnet:countries @=@ FR @=@ France
+dnet:countries @=@ PF @=@ French Polynesia
+dnet:countries @=@ PF @=@ PYF
+dnet:countries @=@ TF @=@ French Southern Territories
+dnet:countries @=@ GE @=@ Georgia
+dnet:countries @=@ DE @=@ DEU
+dnet:countries @=@ DE @=@ Germany
+dnet:countries @=@ DE @=@ Germany, Berlin
+dnet:countries @=@ GH @=@ GHA
+dnet:countries @=@ GR @=@ EL
+dnet:countries @=@ GR @=@ GRC
+dnet:countries @=@ GL @=@ GRL
+dnet:countries @=@ GN @=@ Guinea
+dnet:countries @=@ GW @=@ Guinea-Bissau
+dnet:countries @=@ VA @=@ Vatican State
+dnet:countries @=@ HK @=@ HKG
+dnet:countries @=@ HK @=@ Hong Kong
+dnet:countries @=@ HK @=@ Hongkong
+dnet:countries @=@ HU @=@ HUN
+dnet:countries @=@ HU @=@ Hungary
+dnet:countries @=@ IS @=@ ISL
+dnet:countries @=@ IN @=@ IND
+dnet:countries @=@ IN @=@ India
+dnet:countries @=@ ID @=@ IDN
+dnet:countries @=@ ID @=@ Indonesia
+dnet:countries @=@ IR @=@ Iran
+dnet:countries @=@ IR @=@ Iran, Islamic Republic of
+dnet:countries @=@ IE @=@ IRL
+dnet:countries @=@ IE @=@ Ireland
+dnet:countries @=@ IL @=@ ISR
+dnet:countries @=@ IL @=@ Israel
+dnet:countries @=@ IT @=@ ITA
+dnet:countries @=@ IT @=@ Italy
+dnet:countries @=@ JM @=@ Jamaica
+dnet:countries @=@ JP @=@ JPN
+dnet:countries @=@ JP @=@ Japan
+dnet:countries @=@ KZ @=@ KAZ
+dnet:countries @=@ KZ @=@ Kazakistan
+dnet:countries @=@ KZ @=@ Kazakstan
+dnet:countries @=@ KE @=@ KEN
+dnet:countries @=@ KE @=@ Kenya
+dnet:countries @=@ KR @=@ KOR
+dnet:countries @=@ KR @=@ Korea, Republic of
+dnet:countries @=@ KR @=@ Korean Republic (South Korea)
+dnet:countries @=@ KP @=@ PRK
+dnet:countries @=@ LV @=@ LVA
+dnet:countries @=@ LY @=@ Libya
+dnet:countries @=@ LT @=@ LTU
+dnet:countries @=@ LU @=@ LUX
+dnet:countries @=@ LU @=@ Luxembourg
+dnet:countries @=@ MO @=@ Macao
+dnet:countries @=@ MG @=@ Madagascar
+dnet:countries @=@ MY @=@ Malaysia
+dnet:countries @=@ ML @=@ Mali
+dnet:countries @=@ MT @=@ Malta
+dnet:countries @=@ MU @=@ Mauritius
+dnet:countries @=@ MX @=@ MEX
+dnet:countries @=@ MX @=@ Mexico
+dnet:countries @=@ FM @=@ Micronesia
+dnet:countries @=@ MD @=@ Moldova
+dnet:countries @=@ MD @=@ Moldova, Republic of
+dnet:countries @=@ MN @=@ Mongolia
+dnet:countries @=@ MA @=@ Morocco
+dnet:countries @=@ MZ @=@ Mozambique
+dnet:countries @=@ NA @=@ NAM
+dnet:countries @=@ NL @=@ NLD
+dnet:countries @=@ NL @=@ Netherlands
+dnet:countries @=@ AN @=@ Netherlands Antilles
+dnet:countries @=@ NC @=@ NCL
+dnet:countries @=@ NZ @=@ NZL
+dnet:countries @=@ NZ @=@ New Zealand
+dnet:countries @=@ NO @=@ NOR
+dnet:countries @=@ NO @=@ Norway
+dnet:countries @=@ OC @=@ Australasia
+dnet:countries @=@ OM @=@ Oman
+dnet:countries @=@ PK @=@ PAK
+dnet:countries @=@ PK @=@ Pakistan
+dnet:countries @=@ PS @=@ Palestin, State of
+dnet:countries @=@ PS @=@ Palestine, State of
+dnet:countries @=@ PS @=@ Palestinian Territory, Occupied
+dnet:countries @=@ PA @=@ PAN
+dnet:countries @=@ PA @=@ Panama
+dnet:countries @=@ PG @=@ PapuaNew Guinea
+dnet:countries @=@ PE @=@ PER
+dnet:countries @=@ PH @=@ PHL
+dnet:countries @=@ PH @=@ Philippines
+dnet:countries @=@ PL @=@ POL
+dnet:countries @=@ PL @=@ Poland
+dnet:countries @=@ PT @=@ PRT
+dnet:countries @=@ PT @=@ Portugal
+dnet:countries @=@ PR @=@ Puerto Rico
+dnet:countries @=@ RO @=@ ROU
+dnet:countries @=@ RO @=@ Romania
+dnet:countries @=@ RU @=@ RUS
+dnet:countries @=@ RU @=@ Russia
+dnet:countries @=@ RU @=@ Russian Federation
+dnet:countries @=@ RE @=@ Réunion
+dnet:countries @=@ KN @=@ Saint Kitts And Nevis
+dnet:countries @=@ SA @=@ Saudi Arabia
+dnet:countries @=@ SN @=@ SEN
+dnet:countries @=@ RS @=@ SRB
+dnet:countries @=@ CS @=@ Serbia and Montenegro
+dnet:countries @=@ SG @=@ SGP
+dnet:countries @=@ SG @=@ Singapore
+dnet:countries @=@ SK @=@ SVK
+dnet:countries @=@ SI @=@ SVN
+dnet:countries @=@ SI @=@ Slovenia
+dnet:countries @=@ ZA @=@ South Africa
+dnet:countries @=@ ZA @=@ ZAF
+dnet:countries @=@ ES @=@ ESP
+dnet:countries @=@ ES @=@ Spain
+dnet:countries @=@ LK @=@ LKA
+dnet:countries @=@ LK @=@ Sri Lanka
+dnet:countries @=@ SD @=@ SDN
+dnet:countries @=@ SR @=@ Suriname
+dnet:countries @=@ SE @=@ SWE
+dnet:countries @=@ SE @=@ Sweden
+dnet:countries @=@ CH @=@ CHE
+dnet:countries @=@ CH @=@ Switzerland
+dnet:countries @=@ SY @=@ Syria
+dnet:countries @=@ ST @=@ Sao Tome and Principe
+dnet:countries @=@ TW @=@ TWN
+dnet:countries @=@ TW @=@ Taiwan
+dnet:countries @=@ TW @=@ Taiwan, Province of China
+dnet:countries @=@ TZ @=@ Tanzania
+dnet:countries @=@ TZ @=@ Tanzania, United Republic of
+dnet:countries @=@ TH @=@ THA
+dnet:countries @=@ TH @=@ Thailand
+dnet:countries @=@ TL @=@ East Timor
+dnet:countries @=@ TN @=@ TUN
+dnet:countries @=@ TN @=@ Tunisia
+dnet:countries @=@ TR @=@ TUR
+dnet:countries @=@ TR @=@ Turkey
+dnet:countries @=@ UNKNOWN @=@ AAA
+dnet:countries @=@ UNKNOWN @=@ [Unknown]
+dnet:countries @=@ UNKNOWN @=@ _?
+dnet:countries @=@ UA @=@ UKR
+dnet:countries @=@ UA @=@ Ukraine
+dnet:countries @=@ AE @=@ United Arab Emirates
+dnet:countries @=@ GB @=@ England
+dnet:countries @=@ GB @=@ GBR
+dnet:countries @=@ GB @=@ Great Britain
+dnet:countries @=@ GB @=@ Great Britain and Northern Ireland
+dnet:countries @=@ GB @=@ Scotland
+dnet:countries @=@ GB @=@ UK
+dnet:countries @=@ GB @=@ United Kingdom
+dnet:countries @=@ US @=@ USA
+dnet:countries @=@ US @=@ United States
+dnet:countries @=@ US @=@ United States of America
+dnet:countries @=@ UY @=@ Uruguay
+dnet:countries @=@ UZ @=@ Uzbekistan
+dnet:countries @=@ VE @=@ Venezuela, Bolivarian Republic of
+dnet:countries @=@ VN @=@ Vietnam
+dnet:countries @=@ VG @=@ British Virgin Islands
+dnet:countries @=@ YU @=@ Jugoslavia
+dnet:countries @=@ YU @=@ Yugoslavia
+dnet:countries @=@ ZW @=@ ABW
+dnet:protocols @=@ oai @=@ OAI-PMH
+dnet:protocols @=@ oai @=@ OAI_PMH
+dnet:pid_types @=@ orcid @=@ ORCID12
+dnet:pid_types @=@ handle @=@ hdl
+dnet:review_levels @=@ 0000 @=@ UNKNOWN
+dnet:review_levels @=@ 0002 @=@ 80 大阪経大学会「Working Paper」
+dnet:review_levels @=@ 0002 @=@ AO
+dnet:review_levels @=@ 0002 @=@ ARTICLE SANS COMITE DE LECTURE (ASCL)
+dnet:review_levels @=@ 0002 @=@ Arbeitspapier
+dnet:review_levels @=@ 0002 @=@ Arbeitspapier [workingPaper]
+dnet:review_levels @=@ 0002 @=@ Article (author)
+dnet:review_levels @=@ 0002 @=@ Article type: preprint
+dnet:review_levels @=@ 0002 @=@ Article(author version)
+dnet:review_levels @=@ 0002 @=@ Article, not peer-reviewed
+dnet:review_levels @=@ 0002 @=@ Articulo no evaluado
+dnet:review_levels @=@ 0002 @=@ Artigo Solicitado e Não Avaliado por Pares
+dnet:review_levels @=@ 0002 @=@ Artigo não avaliado pelos pares
+dnet:review_levels @=@ 0002 @=@ Artigo não avaliado por pares
+dnet:review_levels @=@ 0002 @=@ Artigo não avaliado por pres
+dnet:review_levels @=@ 0002 @=@ Artikkeli|Artikkeli ammattilehdessä. Ei vertaisarvioitu
+dnet:review_levels @=@ 0002 @=@ Artículo no evaluado
+dnet:review_levels @=@ 0002 @=@ Book (non peer-reviewed)
+dnet:review_levels @=@ 0002 @=@ Book Part (author)
+dnet:review_levels @=@ 0002 @=@ Book item; Non-peer-reviewed
+dnet:review_levels @=@ 0002 @=@ Conference preprint
+dnet:review_levels @=@ 0002 @=@ Contribution to book (non peer-reviewed)
+dnet:review_levels @=@ 0002 @=@ Discussion Paper
+dnet:review_levels @=@ 0002 @=@ Document de travail (Working Paper)
+dnet:review_levels @=@ 0002 @=@ Documento de trabajo
+dnet:review_levels @=@ 0002 @=@ Documento de trabajo de investigaci??n
+dnet:review_levels @=@ 0002 @=@ Draft
+dnet:review_levels @=@ 0002 @=@ E-pub ahead of print
+dnet:review_levels @=@ 0002 @=@ Editorial de revista, no evaluado por pares
+dnet:review_levels @=@ 0002 @=@ Editorial de revista, não avaliado por pares
+dnet:review_levels @=@ 0002 @=@ Editorial não avaliado pelos pares
+dnet:review_levels @=@ 0002 @=@ Editors (non peer-reviewed)
+dnet:review_levels @=@ 0002 @=@ Epub ahead of print
+dnet:review_levels @=@ 0002 @=@ Hakemlik Sürecinden Geçmiş Makale
+dnet:review_levels @=@ 0002 @=@ Hakemlik sürecindeki makale
+dnet:review_levels @=@ 0002 @=@ Hakemlik sürecinden geçmemiş kitap değerlendirmesi
+dnet:review_levels @=@ 0002 @=@ Journal Article (author version)
+dnet:review_levels @=@ 0002 @=@ Journal Article Preprint
+dnet:review_levels @=@ 0002 @=@ Journal Editorial, not peer-reviewed
+dnet:review_levels @=@ 0002 @=@ Journal article; Non-peer-reviewed
+dnet:review_levels @=@ 0002 @=@ Journal:WorkingPaper
+dnet:review_levels @=@ 0002 @=@ Manuscript (preprint)
+dnet:review_levels @=@ 0002 @=@ Monográfico (Informes, Documentos de trabajo, etc.)
+dnet:review_levels @=@ 0002 @=@ NOTE INTERNE OU DE TRAVAIL
+dnet:review_levels @=@ 0002 @=@ Nicht begutachteter Beitrag
+dnet:review_levels @=@ 0002 @=@ No evaluado por pares
+dnet:review_levels @=@ 0002 @=@ Non-Refereed
+dnet:review_levels @=@ 0002 @=@ Non-refeered article
+dnet:review_levels @=@ 0002 @=@ Non-refereed Article
+dnet:review_levels @=@ 0002 @=@ Non-refereed Book Review
+dnet:review_levels @=@ 0002 @=@ Non-refereed Review
+dnet:review_levels @=@ 0002 @=@ Non-refereed Text
+dnet:review_levels @=@ 0002 @=@ NonPeerReviewed
+dnet:review_levels @=@ 0002 @=@ Not Peer reviewed
+dnet:review_levels @=@ 0002 @=@ Not Reviewed
+dnet:review_levels @=@ 0002 @=@ Not peer-reviewed
+dnet:review_levels @=@ 0002 @=@ Não Avaliado por Pares
+dnet:review_levels @=@ 0002 @=@ Não avaliada pelos pares
+dnet:review_levels @=@ 0002 @=@ Não avaliado pelos pares
+dnet:review_levels @=@ 0002 @=@ Original article (non peer-reviewed)
+dnet:review_levels @=@ 0002 @=@ Other publication (non peer-review)
+dnet:review_levels @=@ 0002 @=@ Pre Print
+dnet:review_levels @=@ 0002 @=@ Pre-print
+dnet:review_levels @=@ 0002 @=@ Preprint Article
+dnet:review_levels @=@ 0002 @=@ Preprints
+dnet:review_levels @=@ 0002 @=@ Preprints, Working Papers, ...
+dnet:review_levels @=@ 0002 @=@ Rapporto tecnico / Working Paper / Rapporto di progetto
+dnet:review_levels @=@ 0002 @=@ Resumo Não Avaliado por Pares
+dnet:review_levels @=@ 0002 @=@ Review article (non peer-reviewed)
+dnet:review_levels @=@ 0002 @=@ SMUR
+dnet:review_levels @=@ 0002 @=@ Submissão dos artigos
+dnet:review_levels @=@ 0002 @=@ Submitted version
+dnet:review_levels @=@ 0002 @=@ Vertaisarvioimaton kirjan tai muun kokoomateoksen osa
+dnet:review_levels @=@ 0002 @=@ Vorabdruck
+dnet:review_levels @=@ 0002 @=@ Wetensch. publ. non-refereed
+dnet:review_levels @=@ 0002 @=@ Working / discussion paper
+dnet:review_levels @=@ 0002 @=@ Working Document
+dnet:review_levels @=@ 0002 @=@ Working Notes
+dnet:review_levels @=@ 0002 @=@ Working Paper
+dnet:review_levels @=@ 0002 @=@ Working Paper / Technical Report
+dnet:review_levels @=@ 0002 @=@ Working Papers
+dnet:review_levels @=@ 0002 @=@ WorkingPaper
+dnet:review_levels @=@ 0002 @=@ article in non peer-reviewed journal
+dnet:review_levels @=@ 0002 @=@ articolo preliminare
+dnet:review_levels @=@ 0002 @=@ articulo preliminar
+dnet:review_levels @=@ 0002 @=@ articulo sin revision por pares
+dnet:review_levels @=@ 0002 @=@ artigo preliminar
+dnet:review_levels @=@ 0002 @=@ artigo sem revisão
+dnet:review_levels @=@ 0002 @=@ artículo preliminar
+dnet:review_levels @=@ 0002 @=@ artículo sin revisión por pares
+dnet:review_levels @=@ 0002 @=@ bookchapter (author version)
+dnet:review_levels @=@ 0002 @=@ borrador
+dnet:review_levels @=@ 0002 @=@ column (author version)
+dnet:review_levels @=@ 0002 @=@ communication_invitee
+dnet:review_levels @=@ 0002 @=@ doc-type:preprint
+dnet:review_levels @=@ 0002 @=@ doc-type:workingPaper
+dnet:review_levels @=@ 0002 @=@ draf
+dnet:review_levels @=@ 0002 @=@ eu-repo/semantics/submittedVersion
+dnet:review_levels @=@ 0002 @=@ http://purl.org/coar/resource_type/c_8042
+dnet:review_levels @=@ 0002 @=@ http://purl.org/coar/resource_type/c_816b
+dnet:review_levels @=@ 0002 @=@ http://purl.org/coar/version/c_71e4c1898caa6e32
+dnet:review_levels @=@ 0002 @=@ http://purl.org/coar/version/c_b1a7d7d4d402bcce
+dnet:review_levels @=@ 0002 @=@ http://purl.org/eprint/type/SubmittedBookItem
+dnet:review_levels @=@ 0002 @=@ http://purl.org/eprint/type/SubmittedJournalArticle
+dnet:review_levels @=@ 0002 @=@ http://purl.org/info:eu-repo/semantics/authorVersion
+dnet:review_levels @=@ 0002 @=@ http://purl.org/info:eu-repo/semantics/submittedVersion
+dnet:review_levels @=@ 0002 @=@ http://purl.org/spar/fabio/Preprint
+dnet:review_levels @=@ 0002 @=@ http://purl.org/spar/fabio/WorkingPaper
+dnet:review_levels @=@ 0002 @=@ https://dictionary.casrai.org/Preprint
+dnet:review_levels @=@ 0002 @=@ info:ar-repo/semantics/documento de trabajo
+dnet:review_levels @=@ 0002 @=@ info:ar-repo/semantics/documentoDeTrabajo
+dnet:review_levels @=@ 0002 @=@ info:eu repo/semantics/draft
+dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/authorVersion
+dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/draft
+dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/preprint
+dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/submitedVersion
+dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/submittedVersion
+dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/unReviewed
+dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/updatedVersion
+dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/workingPaper
+dnet:review_levels @=@ 0002 @=@ info:eu-repo/submittedVersion
+dnet:review_levels @=@ 0002 @=@ info:ulb-repo/semantics/articleNonPeerReview
+dnet:review_levels @=@ 0002 @=@ info:ulb-repo/semantics/openurl/vlink-workingpaper
+dnet:review_levels @=@ 0002 @=@ info:ulb-repo/semantics/workingPaper
+dnet:review_levels @=@ 0002 @=@ non peer-reviewed article
+dnet:review_levels @=@ 0002 @=@ non-refereed review article
+dnet:review_levels @=@ 0002 @=@ não avaliado
+dnet:review_levels @=@ 0002 @=@ preprint
+dnet:review_levels @=@ 0002 @=@ prepublicación
+dnet:review_levels @=@ 0002 @=@ proceeding, seminar, workshop without peer review
+dnet:review_levels @=@ 0002 @=@ proceedings (author version)
+dnet:review_levels @=@ 0002 @=@ pré-print
+dnet:review_levels @=@ 0002 @=@ pré-publication
+dnet:review_levels @=@ 0002 @=@ préprint
+dnet:review_levels @=@ 0002 @=@ prépublication
+dnet:review_levels @=@ 0002 @=@ publicació preliminar
+dnet:review_levels @=@ 0002 @=@ publication-preprint
+dnet:review_levels @=@ 0002 @=@ publication-workingpaper
+dnet:review_levels @=@ 0002 @=@ submitedVersion
+dnet:review_levels @=@ 0002 @=@ submittedVersion
+dnet:review_levels @=@ 0002 @=@ voordruk
+dnet:review_levels @=@ 0002 @=@ workingPaper
+dnet:review_levels @=@ 0002 @=@ ön baskı
+dnet:review_levels @=@ 0002 @=@ Препринт
+dnet:review_levels @=@ 0002 @=@ предпечатная версия публикации
+dnet:review_levels @=@ 0002 @=@ препринт статьи
+dnet:review_levels @=@ 0002 @=@ ディスカッション/ワーキング・ペーパー DP/WP
+dnet:review_levels @=@ 0002 @=@ プレプリント
+dnet:review_levels @=@ 0002 @=@ プレプリント Preprint
+dnet:review_levels @=@ 0002 @=@ プレプリント(Preprint)
+dnet:review_levels @=@ 0002 @=@ 印刷物/電子媒体-その他(査読無し)
+dnet:review_levels @=@ 0002 @=@ 印刷物/電子媒体-テクニカルレポート類(査読無し)
+dnet:review_levels @=@ 0002 @=@ 印刷物/電子媒体-会議発表論文(査読無し)
+dnet:review_levels @=@ 0002 @=@ 印刷物/電子媒体-図書(査読無し)
+dnet:review_levels @=@ 0002 @=@ 印刷物/電子媒体-学術雑誌論文(査読無し)
+dnet:review_levels @=@ 0002 @=@ 印刷物/電子媒体-紀要論文(査読無し)
+dnet:review_levels @=@ 0002 @=@ 印刷物/電子媒体-雑誌記事(査読無し)
+dnet:review_levels @=@ 0002 @=@ 预印本
+dnet:review_levels @=@ 0001 @=@ ##rt.metadata.pkp.peerReviewed##
+dnet:review_levels @=@ 0001 @=@ A1 Alkuperäisartikkeli tieteellisessä aikakauslehdessä
+dnet:review_levels @=@ 0001 @=@ Art?culo revisado por pares
+dnet:review_levels @=@ 0001 @=@ Article revisat per persones expertes
+dnet:review_levels @=@ 0001 @=@ Article type: peer review
+dnet:review_levels @=@ 0001 @=@ Article évalué par les pairs
+dnet:review_levels @=@ 0001 @=@ Article évalué par des pairs
+dnet:review_levels @=@ 0001 @=@ Article évalué par les pairs
+dnet:review_levels @=@ 0001 @=@ Articolo valutato secondo i criteri della peer review
+dnet:review_levels @=@ 0001 @=@ Articulo evaluado por dos pares
+dnet:review_levels @=@ 0001 @=@ Articulo revisado por pares
+dnet:review_levels @=@ 0001 @=@ Artigo Avaliado pelos Pares
+dnet:review_levels @=@ 0001 @=@ Artigo Revisto por Pares
+dnet:review_levels @=@ 0001 @=@ Artigo avaliado por blind peer review
+dnet:review_levels @=@ 0001 @=@ Artigo avaliado por pares
+dnet:review_levels @=@ 0001 @=@ Artigo de convidado. Avaliado pelos pares
+dnet:review_levels @=@ 0001 @=@ Artigos; Avaliado pelos pares
+dnet:review_levels @=@ 0001 @=@ Artículo de investigación, Investigaciones originales, Artículo evaluado por pares, Investigaciones empíricas
+dnet:review_levels @=@ 0001 @=@ Artículo evaluado por pares
+dnet:review_levels @=@ 0001 @=@ Artículo evaluado por pares, Ensayos de investigación
+dnet:review_levels @=@ 0001 @=@ Artículo evaluado por pares, Investigaciones empíricas, Artículos de investigación
+dnet:review_levels @=@ 0001 @=@ Artículo revisado
+dnet:review_levels @=@ 0001 @=@ Artículo revisado por pares
+dnet:review_levels @=@ 0001 @=@ Artículos de estudiantes, Artículo evaluado por pares, Artículos de investigación
+dnet:review_levels @=@ 0001 @=@ Artículos de investigación evaluados por doble ciego
+dnet:review_levels @=@ 0001 @=@ Artículos evaluadores por doble ciego
+dnet:review_levels @=@ 0001 @=@ Artículos evaluados por pares
+dnet:review_levels @=@ 0001 @=@ Artículos evaluados por pares académicos
+dnet:review_levels @=@ 0001 @=@ Artículos revisados por pares
+dnet:review_levels @=@ 0001 @=@ Avaliadas pelos pares
+dnet:review_levels @=@ 0001 @=@ Avaliado anonimamente por pares
+dnet:review_levels @=@ 0001 @=@ Avaliado em duplo cego por pares
+dnet:review_levels @=@ 0001 @=@ Avaliado pela Editoria
+dnet:review_levels @=@ 0001 @=@ Avaliado pela Editoria. Avaliado pelos pares.
+dnet:review_levels @=@ 0001 @=@ Avaliado pelo Editoria
+dnet:review_levels @=@ 0001 @=@ Avaliado pelo pares
+dnet:review_levels @=@ 0001 @=@ Avaliado pelos Editores
+dnet:review_levels @=@ 0001 @=@ Avaliado pelos pares
+dnet:review_levels @=@ 0001 @=@ Avaliado pelos pares, Artigo de convidado
+dnet:review_levels @=@ 0001 @=@ Avaliado pelos pares, Artigos Originais
+dnet:review_levels @=@ 0001 @=@ Avaliado pelos pares, Artigos Originais, Artigos de Revisão
+dnet:review_levels @=@ 0001 @=@ Avaliado pelos pares. Avaliado pelo Editoria
+dnet:review_levels @=@ 0001 @=@ Avaliado po Pares
+dnet:review_levels @=@ 0001 @=@ Avaliado por Editor
+dnet:review_levels @=@ 0001 @=@ Avaliado por pares
+dnet:review_levels @=@ 0001 @=@ Avaliados pelos pares
+dnet:review_levels @=@ 0001 @=@ Avaliados por Pares
+dnet:review_levels @=@ 0001 @=@ Blind Peer-reviewed Article
+dnet:review_levels @=@ 0001 @=@ Book (peer-reviewed)
+dnet:review_levels @=@ 0001 @=@ Comentario de libros, Comentario de revistas, Comentario de conferencias, Artículo evaluado por pares, Artículo de investigación
+dnet:review_levels @=@ 0001 @=@ Conference paper; Peer-reviewed
+dnet:review_levels @=@ 0001 @=@ Contribution to book (peer-reviewed)
+dnet:review_levels @=@ 0001 @=@ Documento Avaliado por Pares
+dnet:review_levels @=@ 0001 @=@ Double blind evaluation articles
+dnet:review_levels @=@ 0001 @=@ Double blind peer review
+dnet:review_levels @=@ 0001 @=@ Editors (peer-reviewed)
+dnet:review_levels @=@ 0001 @=@ Evaluación por pares
+dnet:review_levels @=@ 0001 @=@ Evaluado por pares
+dnet:review_levels @=@ 0001 @=@ Evaluados por los pares
+dnet:review_levels @=@ 0001 @=@ Hakem sürecinden geçmiş makale
+dnet:review_levels @=@ 0001 @=@ Hakemli makale
+dnet:review_levels @=@ 0001 @=@ Hakemlik Sürecinden Geçmiş
+dnet:review_levels @=@ 0001 @=@ Invited Peer-Reviewed Article
+dnet:review_levels @=@ 0001 @=@ Journal article; Peer-reviewed
+dnet:review_levels @=@ 0001 @=@ Original article (peer-reviewed)
+dnet:review_levels @=@ 0001 @=@ Other publication (peer-review)
+dnet:review_levels @=@ 0001 @=@ Paper peer-reviewed
+dnet:review_levels @=@ 0001 @=@ Papers evaluated by academic peers
+dnet:review_levels @=@ 0001 @=@ Peer reviewed
+dnet:review_levels @=@ 0001 @=@ Peer reviewed article
+dnet:review_levels @=@ 0001 @=@ Peer reviewed invited commentry
+dnet:review_levels @=@ 0001 @=@ Peer-Reviewed Protocol
+dnet:review_levels @=@ 0001 @=@ Peer-reviewd Article
+dnet:review_levels @=@ 0001 @=@ Peer-reviewed
+dnet:review_levels @=@ 0001 @=@ Peer-reviewed Article
+dnet:review_levels @=@ 0001 @=@ Peer-reviewed Paper
+dnet:review_levels @=@ 0001 @=@ Peer-reviewed Review
+dnet:review_levels @=@ 0001 @=@ Peer-reviewed Review Article
+dnet:review_levels @=@ 0001 @=@ Peer-reviewed Text
+dnet:review_levels @=@ 0001 @=@ Peer-reviewed communication
+dnet:review_levels @=@ 0001 @=@ Peer-reviewed conference proceedings
+dnet:review_levels @=@ 0001 @=@ Peer-reviewed research article
+dnet:review_levels @=@ 0001 @=@ Peer-reviewed short communication
+dnet:review_levels @=@ 0001 @=@ PeerReviewed
+dnet:review_levels @=@ 0001 @=@ Proceedings (peer-reviewed)
+dnet:review_levels @=@ 0001 @=@ Refereed
+dnet:review_levels @=@ 0001 @=@ Refereed Article
+dnet:review_levels @=@ 0001 @=@ Research articles evaluated by double blind
+dnet:review_levels @=@ 0001 @=@ Resenha avaliada pelos pares
+dnet:review_levels @=@ 0001 @=@ Review article (peer-reviewed)
+dnet:review_levels @=@ 0001 @=@ Reviewed by peers
+dnet:review_levels @=@ 0001 @=@ Revisión por Expertos
+dnet:review_levels @=@ 0001 @=@ Revisto por Pares
+dnet:review_levels @=@ 0001 @=@ SBBq abstracts / peer-reviewed
+dnet:review_levels @=@ 0001 @=@ SBBq resúmenes - revisada por pares
+dnet:review_levels @=@ 0001 @=@ Scholarly publ. Refereed
+dnet:review_levels @=@ 0001 @=@ Scientific Publ (refereed)
+dnet:review_levels @=@ 0001 @=@ Vertaisarvioimaton kirjoitus tieteellisessä aikakauslehdessä
+dnet:review_levels @=@ 0001 @=@ Vertaisarvioitu alkuperäisartikkeli tieteellisessä aikakauslehdessä
+dnet:review_levels @=@ 0001 @=@ Vertaisarvioitu artikkeli konferenssijulkaisussa
+dnet:review_levels @=@ 0001 @=@ Vertaisarvioitu artikkeli tieteellisessä aikakauslehdessä
+dnet:review_levels @=@ 0001 @=@ Vertaisarvioitu kirjan tai muun kokoomateoksen osa
+dnet:review_levels @=@ 0001 @=@ Wetensch. publ. Refereed
+dnet:review_levels @=@ 0001 @=@ article in peer-reviewed journal
+dnet:review_levels @=@ 0001 @=@ articles validés
+dnet:review_levels @=@ 0001 @=@ avaliado por pares, temas livres
+dnet:review_levels @=@ 0001 @=@ info:eu-repo/semantics/peerReviewed
+dnet:review_levels @=@ 0001 @=@ info:ulb-repo/semantics/articlePeerReview
+dnet:review_levels @=@ 0001 @=@ proceeding with peer review
+dnet:review_levels @=@ 0001 @=@ refereed_publications
+dnet:review_levels @=@ 0001 @=@ ul_published_reviewed
+dnet:review_levels @=@ 0001 @=@ Άρθρο που έχει αξιολογηθεί από ομότιμους ειδικούς
+dnet:review_levels @=@ 0001 @=@ Άρθρο το οποίο έχει περάσει από ομότιμη αξιολόγηση
+dnet:review_levels @=@ 0001 @=@ レフェリー付き論文
+dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-テクニカルレポート類(査読有り)
+dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-会議発表論文(査読有り)
+dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-図書(査読有り)
+dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-学術雑誌論文(査読有り)
+dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-紀要論文(査読有り)
+dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-雑誌記事(査読有り)
+dnet:review_levels @=@ 0001 @=@ 原著論文(査読有り)
+dnet:review_levels @=@ 0001 @=@ 査読論文
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/terms.txt b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/terms.txt
new file mode 100644
index 000000000..93cc00eca
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/terms.txt
@@ -0,0 +1,1080 @@
+ModularUiLabels @=@ ModularUiLabels @=@ PendingRepositoryResources @=@ Pending datasource
+ModularUiLabels @=@ ModularUiLabels @=@ RepositoryServiceResources @=@ Valid datasource
+dnet:content_description_typologies @=@ D-Net Content Description Typologies @=@ file::EuropePMC @=@ file::EuropePMC
+dnet:content_description_typologies @=@ D-Net Content Description Typologies @=@ file::PDF @=@ file::PDF
+dnet:content_description_typologies @=@ D-Net Content Description Typologies @=@ file::WoS @=@ file::WoS
+dnet:content_description_typologies @=@ D-Net Content Description Typologies @=@ metadata @=@ metadata
+dnet:content_description_typologies @=@ D-Net Content Description Typologies @=@ file::hybrid @=@ file::hybrid
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:crosswalk:cris @=@ Harvested
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:actionset:orcidworks-no-doi @=@ Harvested
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:crosswalk:infospace @=@ Harvested
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:crosswalk @=@ Harvested
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:crosswalk:aggregator @=@ Harvested
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:crosswalk:datasetarchive @=@ Harvested
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:actionset @=@ Harvested
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:crosswalk:entityregistry @=@ Harvested
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:crosswalk:repository @=@ Harvested
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:mining:aggregator @=@ Inferred by OpenAIRE
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ community:subject @=@ Inferred by OpenAIRE
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ community:zenodocommunity @=@ Inferred by OpenAIRE
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ iis @=@ Inferred by OpenAIRE
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:mining:entityregistry @=@ Inferred by OpenAIRE
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ community:organization @=@ Inferred by OpenAIRE
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:mining:infospace @=@ Inferred by OpenAIRE
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:dedup @=@ Inferred by OpenAIRE
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ community:datasource @=@ Inferred by OpenAIRE
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ propagation:project:semrel @=@ Inferred by OpenAIRE
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:mining:cris @=@ Inferred by OpenAIRE
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:mining:repository @=@ Inferred by OpenAIRE
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:mining:datasetarchive @=@ Inferred by OpenAIRE
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ community:semrel @=@ Inferred by OpenAIRE
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ user:claim @=@ Linked by user
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ user:claim:pid @=@ Linked by user
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ user:insert @=@ Linked by user
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ user:claim:search @=@ Linked by user
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ UNKNOWN @=@ UNKNOWN
+dnet:provenanceActions @=@ dnet:provenanceActions @=@ country:instrepos @=@ Inferred by OpenAIRE
+dnet:access_modes @=@ dnet:access_modes @=@ 12MONTHS @=@ 12 Months Embargo
+dnet:access_modes @=@ dnet:access_modes @=@ 6MONTHS @=@ 6 Months Embargo
+dnet:access_modes @=@ dnet:access_modes @=@ CLOSED @=@ Closed Access
+dnet:access_modes @=@ dnet:access_modes @=@ EMBARGO @=@ Embargo
+dnet:access_modes @=@ dnet:access_modes @=@ OPEN @=@ Open Access
+dnet:access_modes @=@ dnet:access_modes @=@ OPEN SOURCE @=@ Open Source
+dnet:access_modes @=@ dnet:access_modes @=@ OTHER @=@ Other
+dnet:access_modes @=@ dnet:access_modes @=@ RESTRICTED @=@ Restricted
+dnet:access_modes @=@ dnet:access_modes @=@ UNKNOWN @=@ not available
+fct:funding_typologies @=@ fct:funding_typologies @=@ fct:program @=@ fct:program
+dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ openaire2.0 @=@ OpenAIRE 2.0 (EC funding)
+dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ openaire3.0 @=@ OpenAIRE 3.0 (OA, funding)
+dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ driver @=@ OpenAIRE Basic (DRIVER OA)
+dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ openaire-cris_1.1 @=@ OpenAIRE CRIS v1.1
+dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ openaire2.0_data @=@ OpenAIRE Data (funded, referenced datasets)
+dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ openaire-pub_4.0 @=@ OpenAIRE PubRepos v4.0
+dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ hostedBy @=@ collected from a compatible aggregator
+dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ files @=@ files
+dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ native @=@ native
+dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ UNKNOWN @=@ not available
+dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ notCompatible @=@ under validation
+dnet:dataCite_date @=@ dnet:dataCite_date @=@ UNKNOWN @=@ UNKNOWN
+dnet:dataCite_date @=@ dnet:dataCite_date @=@ available @=@ available
+dnet:dataCite_date @=@ dnet:dataCite_date @=@ copyrighted @=@ copyrighted
+dnet:dataCite_date @=@ dnet:dataCite_date @=@ created @=@ created
+dnet:dataCite_date @=@ dnet:dataCite_date @=@ endDate @=@ endDate
+dnet:dataCite_date @=@ dnet:dataCite_date @=@ issued @=@ issued
+dnet:dataCite_date @=@ dnet:dataCite_date @=@ startDate @=@ startDate
+dnet:dataCite_date @=@ dnet:dataCite_date @=@ submitted @=@ submitted
+dnet:dataCite_date @=@ dnet:dataCite_date @=@ updated @=@ updated
+dnet:dataCite_date @=@ dnet:dataCite_date @=@ valid @=@ valid
+dnet:dataCite_date @=@ dnet:dataCite_date @=@ published-print @=@ published-print
+dnet:dataCite_date @=@ dnet:dataCite_date @=@ published-online @=@ published-online
+dnet:dataCite_date @=@ dnet:dataCite_date @=@ accepted @=@ accepted
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ crissystem @=@ CRIS System
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ datarepository::unknown @=@ Data Repository
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ aggregator::datarepository @=@ Data Repository Aggregator
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ entityregistry::projects @=@ Funder database
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ infospace @=@ Information Space
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ pubsrepository::institutional @=@ Institutional Repository
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ aggregator::pubsrepository::institutional @=@ Institutional Repository Aggregator
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ pubsrepository::journal @=@ Journal
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ aggregator::pubsrepository::journals @=@ Journal Aggregator/Publisher
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ pubsrepository::mock @=@ Other
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ pubscatalogue::unknown @=@ Publication Catalogue
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ pubsrepository::unknown @=@ Publication Repository
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ aggregator::pubsrepository::unknown @=@ Publication Repository Aggregator
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ entityregistry @=@ Registry
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ entityregistry::repositories @=@ Registry of repositories
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ entityregistry::products @=@ Registry of research products
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ entityregistry::researchers @=@ Registry of researchers
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ entityregistry::organizations @=@ Registry of organizations
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ scholarcomminfra @=@ Scholarly Comm. Infrastructure
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ softwarerepository @=@ Software Repository
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ pubsrepository::thematic @=@ Thematic Repository
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ websource @=@ Web Source
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ aggregator::softwarerepository @=@ Software Repository Aggregator
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ orprepository @=@ Repository
+dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ researchgraph @=@ Research Graph
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ ACM @=@ ACM Computing Classification System
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ agrovoc @=@ AGROVOC
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ bicssc @=@ BIC standard subject categories
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ DFG @=@ DFG Classification
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ ddc @=@ Dewey Decimal Classification
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ nsf:fieldOfApplication @=@ Field of Application (NSF)
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ gok @=@ Göttingen Online Classification
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ ec:h2020topics @=@ Horizon 2020 Topics
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ IPC @=@ International Patent Classification
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ jel @=@ JEL Classification
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ lcsh @=@ Library of Congress Subject Headings
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ msc @=@ Mathematics Subject Classification
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ mesheuropmc @=@ Medical Subject Headings
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ mesh @=@ Medical Subject Headings
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ bk @=@ Nederlandse basisclassificatie
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ dnet:od_subjects @=@ OpenDOAR subjects
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ ocis @=@ Optics Classification and Indexing Scheme
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ pacs @=@ Physics and Astronomy Classification Scheme
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ rvk @=@ Regensburger Verbundklassifikation
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ UNKNOWN @=@ UNKNOWN
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ udc @=@ Universal Decimal Classification
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ wos @=@ Web of Science Subject Areas
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ arxiv @=@ arXiv
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ keyword @=@ keyword
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ MAG @=@ Microsoft Academic Graph classification
+fct:contractTypes @=@ fct:contractTypes @=@ UNKNOWN @=@ UNKNOWN
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0018 @=@ Annotation
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0001 @=@ Article
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0033 @=@ Audiovisual
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0008 @=@ Bachelor thesis
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0046 @=@ Bioentity
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0002 @=@ Book
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0037 @=@ Clinical Trial
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0022 @=@ Collection
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0004 @=@ Conference object
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0005 @=@ Contribution for newspaper or weekly magazine
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0045 @=@ Data Management Plan
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0031 @=@ Data Paper
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0021 @=@ Dataset
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0006 @=@ Doctoral thesis
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0023 @=@ Event
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0009 @=@ External research report
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0024 @=@ Film
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0025 @=@ Image
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0026 @=@ InteractiveResource
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0011 @=@ Internal report
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0043 @=@ Journal
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0010 @=@ Lecture
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0007 @=@ Master thesis
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0027 @=@ Model
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0012 @=@ Newsletter
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0020 @=@ Other ORP type
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0039 @=@ Other dataset type
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0038 @=@ Other literature type
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0040 @=@ Other software type
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0013 @=@ Part of book or chapter of book
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0019 @=@ Patent
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0028 @=@ PhysicalObject
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0016 @=@ Preprint
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0034 @=@ Project deliverable
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0035 @=@ Project milestone
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0036 @=@ Project proposal
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0017 @=@ Report
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0014 @=@ Research
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0015 @=@ Review
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0029 @=@ Software
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0032 @=@ Software Paper
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0030 @=@ Sound
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0044 @=@ Thesis
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0000 @=@ Unknown
+dnet:publication_resource @=@ dnet:publication_resource @=@ 0042 @=@ Virtual Appliance
+ec:funding_typologies @=@ ec:funding_typologies @=@ ec:frameworkprogram @=@ frameworkprogram
+ec:funding_typologies @=@ ec:funding_typologies @=@ ec:program @=@ program
+ec:funding_typologies @=@ ec:funding_typologies @=@ ec:specificprogram @=@ specificprogram
+ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ 171 @=@ Article 171 of the Treaty
+ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ BSG @=@ Research for the benefit of specific groups
+ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ CIP-EIP-TN @=@ CIP-Eco-Innovation - CIP-Thematic Network
+ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ CP @=@ Collaborative project
+ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ CP-CSA @=@ Combination of CP & CSA
+ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ CSA @=@ Coordination and support action
+ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ ERC @=@ Support for frontier research (ERC)
+ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ MC @=@ Support for training and career development of researchers (Marie Curie)
+ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ NoE @=@ Network of Excellence
+wt:funding_relations @=@ wt:funding_relations @=@ wt:hasParentFunding @=@ wt:hasParentFunding
+dnet:languages @=@ dnet:languages @=@ abk @=@ Abkhazian
+dnet:languages @=@ dnet:languages @=@ ace @=@ Achinese
+dnet:languages @=@ dnet:languages @=@ ach @=@ Acoli
+dnet:languages @=@ dnet:languages @=@ ada @=@ Adangme
+dnet:languages @=@ dnet:languages @=@ aar @=@ Afar
+dnet:languages @=@ dnet:languages @=@ afh @=@ Afrihili
+dnet:languages @=@ dnet:languages @=@ afr @=@ Afrikaans
+dnet:languages @=@ dnet:languages @=@ afa @=@ Afro-Asiatic
+dnet:languages @=@ dnet:languages @=@ aka @=@ Akan
+dnet:languages @=@ dnet:languages @=@ akk @=@ Akkadian
+dnet:languages @=@ dnet:languages @=@ alb/sqi @=@ Albanian
+dnet:languages @=@ dnet:languages @=@ ale @=@ Aleut
+dnet:languages @=@ dnet:languages @=@ alg @=@ Algonquian languages
+dnet:languages @=@ dnet:languages @=@ tut @=@ Altaic
+dnet:languages @=@ dnet:languages @=@ amh @=@ Amharic
+dnet:languages @=@ dnet:languages @=@ egy @=@ Ancient Egyptian
+dnet:languages @=@ dnet:languages @=@ grc @=@ Ancient Greek
+dnet:languages @=@ dnet:languages @=@ apa @=@ Apache
+dnet:languages @=@ dnet:languages @=@ ara @=@ Arabic
+dnet:languages @=@ dnet:languages @=@ arg @=@ Aragonese
+dnet:languages @=@ dnet:languages @=@ arc @=@ Aramaic
+dnet:languages @=@ dnet:languages @=@ arp @=@ Arapaho
+dnet:languages @=@ dnet:languages @=@ arn @=@ Araucanian
+dnet:languages @=@ dnet:languages @=@ arw @=@ Arawak
+dnet:languages @=@ dnet:languages @=@ arm/hye @=@ Armenian
+dnet:languages @=@ dnet:languages @=@ art @=@ Artificial
+dnet:languages @=@ dnet:languages @=@ asm @=@ Assamese
+dnet:languages @=@ dnet:languages @=@ ath @=@ Athapascan
+dnet:languages @=@ dnet:languages @=@ map @=@ Austronesian
+dnet:languages @=@ dnet:languages @=@ ina @=@ Auxiliary Language Association)
+dnet:languages @=@ dnet:languages @=@ ava @=@ Avaric
+dnet:languages @=@ dnet:languages @=@ ave @=@ Avestan
+dnet:languages @=@ dnet:languages @=@ awa @=@ Awadhi
+dnet:languages @=@ dnet:languages @=@ aym @=@ Aymara
+dnet:languages @=@ dnet:languages @=@ aze @=@ Azerbaijani
+dnet:languages @=@ dnet:languages @=@ nah @=@ Aztec
+dnet:languages @=@ dnet:languages @=@ ban @=@ Balinese
+dnet:languages @=@ dnet:languages @=@ bat @=@ Baltic
+dnet:languages @=@ dnet:languages @=@ bal @=@ Baluchi
+dnet:languages @=@ dnet:languages @=@ bam @=@ Bambara
+dnet:languages @=@ dnet:languages @=@ bai @=@ Bamileke
+dnet:languages @=@ dnet:languages @=@ bad @=@ Banda
+dnet:languages @=@ dnet:languages @=@ bnt @=@ Bantu
+dnet:languages @=@ dnet:languages @=@ bas @=@ Basa
+dnet:languages @=@ dnet:languages @=@ bak @=@ Bashkir
+dnet:languages @=@ dnet:languages @=@ baq/eus @=@ Basque
+dnet:languages @=@ dnet:languages @=@ bej @=@ Beja
+dnet:languages @=@ dnet:languages @=@ bel @=@ Belarusian
+dnet:languages @=@ dnet:languages @=@ bem @=@ Bemba
+dnet:languages @=@ dnet:languages @=@ ben @=@ Bengali
+dnet:languages @=@ dnet:languages @=@ ber @=@ Berber
+dnet:languages @=@ dnet:languages @=@ bho @=@ Bhojpuri
+dnet:languages @=@ dnet:languages @=@ bih @=@ Bihari
+dnet:languages @=@ dnet:languages @=@ bik @=@ Bikol
+dnet:languages @=@ dnet:languages @=@ bin @=@ Bini
+dnet:languages @=@ dnet:languages @=@ bis @=@ Bislama
+dnet:languages @=@ dnet:languages @=@ nob @=@ Bokmål, Norwegian; Norwegian Bokmål
+dnet:languages @=@ dnet:languages @=@ bos @=@ Bosnian
+dnet:languages @=@ dnet:languages @=@ bra @=@ Braj
+dnet:languages @=@ dnet:languages @=@ bre @=@ Breton
+dnet:languages @=@ dnet:languages @=@ bug @=@ Buginese
+dnet:languages @=@ dnet:languages @=@ bul @=@ Bulgarian
+dnet:languages @=@ dnet:languages @=@ bua @=@ Buriat
+dnet:languages @=@ dnet:languages @=@ bur/mya @=@ Burmese
+dnet:languages @=@ dnet:languages @=@ cad @=@ Caddo
+dnet:languages @=@ dnet:languages @=@ car @=@ Carib
+dnet:languages @=@ dnet:languages @=@ cat @=@ Catalan; Valencian
+dnet:languages @=@ dnet:languages @=@ cau @=@ Caucasian
+dnet:languages @=@ dnet:languages @=@ ceb @=@ Cebuano
+dnet:languages @=@ dnet:languages @=@ cel @=@ Celtic
+dnet:languages @=@ dnet:languages @=@ cai @=@ Central American Indian
+dnet:languages @=@ dnet:languages @=@ chg @=@ Chagatai
+dnet:languages @=@ dnet:languages @=@ cha @=@ Chamorro
+dnet:languages @=@ dnet:languages @=@ che @=@ Chechen
+dnet:languages @=@ dnet:languages @=@ chr @=@ Cherokee
+dnet:languages @=@ dnet:languages @=@ nya @=@ Chewa; Chichewa; Nyanja
+dnet:languages @=@ dnet:languages @=@ chy @=@ Cheyenne
+dnet:languages @=@ dnet:languages @=@ chb @=@ Chibcha
+dnet:languages @=@ dnet:languages @=@ chi/zho @=@ Chinese
+dnet:languages @=@ dnet:languages @=@ chn @=@ Chinook jargon
+dnet:languages @=@ dnet:languages @=@ cho @=@ Choctaw
+dnet:languages @=@ dnet:languages @=@ chu @=@ Church Slavic; Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic
+dnet:languages @=@ dnet:languages @=@ chv @=@ Chuvash
+dnet:languages @=@ dnet:languages @=@ cop @=@ Coptic
+dnet:languages @=@ dnet:languages @=@ cor @=@ Cornish
+dnet:languages @=@ dnet:languages @=@ cos @=@ Corsican
+dnet:languages @=@ dnet:languages @=@ cre @=@ Cree
+dnet:languages @=@ dnet:languages @=@ mus @=@ Creek
+dnet:languages @=@ dnet:languages @=@ crp @=@ Creoles and Pidgins
+dnet:languages @=@ dnet:languages @=@ hrv @=@ Croatian
+dnet:languages @=@ dnet:languages @=@ cus @=@ Cushitic
+dnet:languages @=@ dnet:languages @=@ ces/cze @=@ Czech
+dnet:languages @=@ dnet:languages @=@ dak @=@ Dakota
+dnet:languages @=@ dnet:languages
@=@ dan @=@ Danish +dnet:languages @=@ dnet:languages @=@ del @=@ Delaware +dnet:languages @=@ dnet:languages @=@ din @=@ Dinka +dnet:languages @=@ dnet:languages @=@ div @=@ Divehi +dnet:languages @=@ dnet:languages @=@ doi @=@ Dogri +dnet:languages @=@ dnet:languages @=@ dra @=@ Dravidian +dnet:languages @=@ dnet:languages @=@ dua @=@ Duala +dnet:languages @=@ dnet:languages @=@ dut/nld @=@ Dutch; Flemish +dnet:languages @=@ dnet:languages @=@ dyu @=@ Dyula +dnet:languages @=@ dnet:languages @=@ dzo @=@ Dzongkha +dnet:languages @=@ dnet:languages @=@ efi @=@ Efik +dnet:languages @=@ dnet:languages @=@ eka @=@ Ekajuk +dnet:languages @=@ dnet:languages @=@ elx @=@ Elamite +dnet:languages @=@ dnet:languages @=@ eng @=@ English +dnet:languages @=@ dnet:languages @=@ cpe @=@ English-based Creoles and Pidgins +dnet:languages @=@ dnet:languages @=@ esk @=@ Eskimo +dnet:languages @=@ dnet:languages @=@ epo @=@ Esperanto +dnet:languages @=@ dnet:languages @=@ est @=@ Estonian +dnet:languages @=@ dnet:languages @=@ ewe @=@ Ewe +dnet:languages @=@ dnet:languages @=@ ewo @=@ Ewondo +dnet:languages @=@ dnet:languages @=@ fan @=@ Fang +dnet:languages @=@ dnet:languages @=@ fat @=@ Fanti +dnet:languages @=@ dnet:languages @=@ fao @=@ Faroese +dnet:languages @=@ dnet:languages @=@ fij @=@ Fijian +dnet:languages @=@ dnet:languages @=@ fin @=@ Finnish +dnet:languages @=@ dnet:languages @=@ fiu @=@ Finno-Ugrian +dnet:languages @=@ dnet:languages @=@ fon @=@ Fon +dnet:languages @=@ dnet:languages @=@ fra/fre @=@ French +dnet:languages @=@ dnet:languages @=@ cpf @=@ French-based Creoles and Pidgins +dnet:languages @=@ dnet:languages @=@ fry @=@ Frisian +dnet:languages @=@ dnet:languages @=@ ful @=@ Fulah +dnet:languages @=@ dnet:languages @=@ gaa @=@ Ga +dnet:languages @=@ dnet:languages @=@ gae/gdh @=@ Gaelic +dnet:languages @=@ dnet:languages @=@ gla @=@ Gaelic; Scottish Gaelic +dnet:languages @=@ dnet:languages @=@ glg @=@ Galician +dnet:languages @=@ dnet:languages @=@ lug @=@ Ganda +dnet:languages @=@ dnet:languages @=@ gay @=@ Gayo +dnet:languages @=@ dnet:languages @=@ gez @=@ Geez +dnet:languages @=@ dnet:languages @=@ geo/kat @=@ Georgian +dnet:languages @=@ dnet:languages @=@ deu/ger @=@ German +dnet:languages @=@ dnet:languages @=@ gem @=@ Germanic +dnet:languages @=@ dnet:languages @=@ kik @=@ Gikuyu; Kikuyu +dnet:languages @=@ dnet:languages @=@ gil @=@ Gilbertese +dnet:languages @=@ dnet:languages @=@ gon @=@ Gondi +dnet:languages @=@ dnet:languages @=@ got @=@ Gothic +dnet:languages @=@ dnet:languages @=@ grb @=@ Grebo +dnet:languages @=@ dnet:languages @=@ ell/gre @=@ Greek +dnet:languages @=@ dnet:languages @=@ gre/ell @=@ Greek, Modern (1453-) +dnet:languages @=@ dnet:languages @=@ kal @=@ Greenlandic; Kalaallisut +dnet:languages @=@ dnet:languages @=@ grn @=@ Guarani +dnet:languages @=@ dnet:languages @=@ guj @=@ Gujarati +dnet:languages @=@ dnet:languages @=@ hai @=@ Haida +dnet:languages @=@ dnet:languages @=@ hat @=@ Haitian; Haitian Creole +dnet:languages @=@ dnet:languages @=@ hau @=@ Hausa +dnet:languages @=@ dnet:languages @=@ haw @=@ Hawaiian +dnet:languages @=@ dnet:languages @=@ heb @=@ Hebrew +dnet:languages @=@ dnet:languages @=@ her @=@ Herero +dnet:languages @=@ dnet:languages @=@ hil @=@ Hiligaynon +dnet:languages @=@ dnet:languages @=@ him @=@ Himachali +dnet:languages @=@ dnet:languages @=@ hin @=@ Hindi +dnet:languages @=@ dnet:languages @=@ hmo @=@ Hiri Motu +dnet:languages @=@ dnet:languages @=@ hun @=@ Hungarian +dnet:languages @=@ dnet:languages @=@ hup @=@ Hupa 
+dnet:languages @=@ dnet:languages @=@ iba @=@ Iban +dnet:languages @=@ dnet:languages @=@ ice/isl @=@ Icelandic +dnet:languages @=@ dnet:languages @=@ ido @=@ Ido +dnet:languages @=@ dnet:languages @=@ ibo @=@ Igbo +dnet:languages @=@ dnet:languages @=@ ijo @=@ Ijo +dnet:languages @=@ dnet:languages @=@ ilo @=@ Iloko +dnet:languages @=@ dnet:languages @=@ inc @=@ Indic +dnet:languages @=@ dnet:languages @=@ ine @=@ Indo-European +dnet:languages @=@ dnet:languages @=@ ind @=@ Indonesian +dnet:languages @=@ dnet:languages @=@ ile @=@ Interlingue +dnet:languages @=@ dnet:languages @=@ iku @=@ Inuktitut +dnet:languages @=@ dnet:languages @=@ ipk @=@ Inupiaq +dnet:languages @=@ dnet:languages @=@ ira @=@ Iranian +dnet:languages @=@ dnet:languages @=@ gai/iri @=@ Irish +dnet:languages @=@ dnet:languages @=@ iro @=@ Iroquoian +dnet:languages @=@ dnet:languages @=@ ita @=@ Italian +dnet:languages @=@ dnet:languages @=@ jpn @=@ Japanese +dnet:languages @=@ dnet:languages @=@ jav @=@ Javanese +dnet:languages @=@ dnet:languages @=@ jrb @=@ Judeo-Arabic +dnet:languages @=@ dnet:languages @=@ jpr @=@ Judeo-Persian +dnet:languages @=@ dnet:languages @=@ kab @=@ Kabyle +dnet:languages @=@ dnet:languages @=@ kac @=@ Kachin +dnet:languages @=@ dnet:languages @=@ kam @=@ Kamba +dnet:languages @=@ dnet:languages @=@ kan @=@ Kannada +dnet:languages @=@ dnet:languages @=@ kau @=@ Kanuri +dnet:languages @=@ dnet:languages @=@ kaa @=@ Kara-Kalpak +dnet:languages @=@ dnet:languages @=@ kar @=@ Karen +dnet:languages @=@ dnet:languages @=@ kas @=@ Kashmiri +dnet:languages @=@ dnet:languages @=@ kaw @=@ Kawi +dnet:languages @=@ dnet:languages @=@ kaz @=@ Kazakh +dnet:languages @=@ dnet:languages @=@ kha @=@ Khasi +dnet:languages @=@ dnet:languages @=@ khm @=@ Khmer +dnet:languages @=@ dnet:languages @=@ khi @=@ Khoisan +dnet:languages @=@ dnet:languages @=@ kho @=@ Khotanese +dnet:languages @=@ dnet:languages @=@ kin @=@ Kinyarwanda +dnet:languages @=@ dnet:languages @=@ kir @=@ Kirghiz +dnet:languages @=@ dnet:languages @=@ kom @=@ Komi +dnet:languages @=@ dnet:languages @=@ kon @=@ Kongo +dnet:languages @=@ dnet:languages @=@ kok @=@ Konkani +dnet:languages @=@ dnet:languages @=@ kor @=@ Korean +dnet:languages @=@ dnet:languages @=@ kpe @=@ Kpelle +dnet:languages @=@ dnet:languages @=@ kro @=@ Kru +dnet:languages @=@ dnet:languages @=@ kua @=@ Kuanyama; Kwanyama +dnet:languages @=@ dnet:languages @=@ kum @=@ Kumyk +dnet:languages @=@ dnet:languages @=@ kur @=@ Kurdish +dnet:languages @=@ dnet:languages @=@ kru @=@ Kurukh +dnet:languages @=@ dnet:languages @=@ kus @=@ Kusaie +dnet:languages @=@ dnet:languages @=@ kut @=@ Kutenai +dnet:languages @=@ dnet:languages @=@ lad @=@ Ladino +dnet:languages @=@ dnet:languages @=@ lah @=@ Lahnda +dnet:languages @=@ dnet:languages @=@ lam @=@ Lamba +dnet:languages @=@ dnet:languages @=@ lao @=@ Lao +dnet:languages @=@ dnet:languages @=@ lat @=@ Latin +dnet:languages @=@ dnet:languages @=@ lav @=@ Latvian +dnet:languages @=@ dnet:languages @=@ ltz @=@ Letzeburgesch; Luxembourgish +dnet:languages @=@ dnet:languages @=@ lez @=@ Lezghian +dnet:languages @=@ dnet:languages @=@ lim @=@ Limburgan; Limburger; Limburgish +dnet:languages @=@ dnet:languages @=@ lin @=@ Lingala +dnet:languages @=@ dnet:languages @=@ lit @=@ Lithuanian +dnet:languages @=@ dnet:languages @=@ loz @=@ Lozi +dnet:languages @=@ dnet:languages @=@ lub @=@ Luba-Katanga +dnet:languages @=@ dnet:languages @=@ lui @=@ Luiseno +dnet:languages @=@ dnet:languages @=@ lun @=@ Lunda +dnet:languages @=@ dnet:languages 
@=@ luo @=@ Luo +dnet:languages @=@ dnet:languages @=@ mac/mak @=@ Macedonian +dnet:languages @=@ dnet:languages @=@ mad @=@ Madurese +dnet:languages @=@ dnet:languages @=@ mag @=@ Magahi +dnet:languages @=@ dnet:languages @=@ mai @=@ Maithili +dnet:languages @=@ dnet:languages @=@ mak @=@ Makasar +dnet:languages @=@ dnet:languages @=@ mlg @=@ Malagasy +dnet:languages @=@ dnet:languages @=@ may/msa @=@ Malay +dnet:languages @=@ dnet:languages @=@ mal @=@ Malayalam +dnet:languages @=@ dnet:languages @=@ mlt @=@ Maltese +dnet:languages @=@ dnet:languages @=@ man @=@ Mandingo +dnet:languages @=@ dnet:languages @=@ mni @=@ Manipuri +dnet:languages @=@ dnet:languages @=@ mno @=@ Manobo +dnet:languages @=@ dnet:languages @=@ glv @=@ Manx +dnet:languages @=@ dnet:languages @=@ mao/mri @=@ Maori +dnet:languages @=@ dnet:languages @=@ mar @=@ Marathi +dnet:languages @=@ dnet:languages @=@ chm @=@ Mari +dnet:languages @=@ dnet:languages @=@ mah @=@ Marshallese +dnet:languages @=@ dnet:languages @=@ mwr @=@ Marwari +dnet:languages @=@ dnet:languages @=@ mas @=@ Masai +dnet:languages @=@ dnet:languages @=@ myn @=@ Mayan +dnet:languages @=@ dnet:languages @=@ men @=@ Mende +dnet:languages @=@ dnet:languages @=@ mic @=@ Micmac +dnet:languages @=@ dnet:languages @=@ dum @=@ Middle Dutch +dnet:languages @=@ dnet:languages @=@ enm @=@ Middle English +dnet:languages @=@ dnet:languages @=@ frm @=@ Middle French +dnet:languages @=@ dnet:languages @=@ gmh @=@ Middle High German +dnet:languages @=@ dnet:languages @=@ mga @=@ Middle Irish +dnet:languages @=@ dnet:languages @=@ min @=@ Minangkabau +dnet:languages @=@ dnet:languages @=@ mis @=@ Miscellaneous +dnet:languages @=@ dnet:languages @=@ moh @=@ Mohawk +dnet:languages @=@ dnet:languages @=@ mol @=@ Moldavian +dnet:languages @=@ dnet:languages @=@ mkh @=@ Mon-Kmer +dnet:languages @=@ dnet:languages @=@ lol @=@ Mongo +dnet:languages @=@ dnet:languages @=@ mon @=@ Mongolian +dnet:languages @=@ dnet:languages @=@ mos @=@ Mossi +dnet:languages @=@ dnet:languages @=@ mul @=@ Multiple languages +dnet:languages @=@ dnet:languages @=@ mun @=@ Munda +dnet:languages @=@ dnet:languages @=@ nau @=@ Nauru +dnet:languages @=@ dnet:languages @=@ nav @=@ Navajo; Navaho +dnet:languages @=@ dnet:languages @=@ nde @=@ Ndebele, North +dnet:languages @=@ dnet:languages @=@ nbl @=@ Ndebele, South +dnet:languages @=@ dnet:languages @=@ ndo @=@ Ndonga +dnet:languages @=@ dnet:languages @=@ nep @=@ Nepali +dnet:languages @=@ dnet:languages @=@ new @=@ Newari +dnet:languages @=@ dnet:languages @=@ nic @=@ Niger-Kordofanian +dnet:languages @=@ dnet:languages @=@ ssa @=@ Nilo-Saharan +dnet:languages @=@ dnet:languages @=@ niu @=@ Niuean +dnet:languages @=@ dnet:languages @=@ non @=@ Norse +dnet:languages @=@ dnet:languages @=@ nai @=@ North American Indian +dnet:languages @=@ dnet:languages @=@ sme @=@ Northern Sami +dnet:languages @=@ dnet:languages @=@ nor @=@ Norwegian +dnet:languages @=@ dnet:languages @=@ nno @=@ Norwegian Nynorsk; Nynorsk, Norwegian +dnet:languages @=@ dnet:languages @=@ nub @=@ Nubian +dnet:languages @=@ dnet:languages @=@ nym @=@ Nyamwezi +dnet:languages @=@ dnet:languages @=@ nyn @=@ Nyankole +dnet:languages @=@ dnet:languages @=@ nyo @=@ Nyoro +dnet:languages @=@ dnet:languages @=@ nzi @=@ Nzima +dnet:languages @=@ dnet:languages @=@ oci @=@ Occitan (post 1500); Provençal +dnet:languages @=@ dnet:languages @=@ oji @=@ Ojibwa +dnet:languages @=@ dnet:languages @=@ ang @=@ Old English +dnet:languages @=@ dnet:languages @=@ fro @=@ Old French +dnet:languages 
@=@ dnet:languages @=@ goh @=@ Old High German +dnet:languages @=@ dnet:languages @=@ ori @=@ Oriya +dnet:languages @=@ dnet:languages @=@ orm @=@ Oromo +dnet:languages @=@ dnet:languages @=@ osa @=@ Osage +dnet:languages @=@ dnet:languages @=@ oss @=@ Ossetian; Ossetic +dnet:languages @=@ dnet:languages @=@ oto @=@ Otomian +dnet:languages @=@ dnet:languages @=@ ota @=@ Ottoman +dnet:languages @=@ dnet:languages @=@ pal @=@ Pahlavi +dnet:languages @=@ dnet:languages @=@ pau @=@ Palauan +dnet:languages @=@ dnet:languages @=@ pli @=@ Pali +dnet:languages @=@ dnet:languages @=@ pam @=@ Pampanga +dnet:languages @=@ dnet:languages @=@ pag @=@ Pangasinan +dnet:languages @=@ dnet:languages @=@ pan @=@ Panjabi; Punjabi +dnet:languages @=@ dnet:languages @=@ pap @=@ Papiamento +dnet:languages @=@ dnet:languages @=@ paa @=@ Papuan-Australian +dnet:languages @=@ dnet:languages @=@ fas/per @=@ Persian +dnet:languages @=@ dnet:languages @=@ peo @=@ Persian, Old (ca 600 - 400 B.C.) +dnet:languages @=@ dnet:languages @=@ phn @=@ Phoenician +dnet:languages @=@ dnet:languages @=@ pol @=@ Polish +dnet:languages @=@ dnet:languages @=@ pon @=@ Ponape +dnet:languages @=@ dnet:languages @=@ por @=@ Portuguese +dnet:languages @=@ dnet:languages @=@ cpp @=@ Portuguese-based Creoles and Pidgins +dnet:languages @=@ dnet:languages @=@ pra @=@ Prakrit +dnet:languages @=@ dnet:languages @=@ pro @=@ Provencal +dnet:languages @=@ dnet:languages @=@ pus @=@ Pushto +dnet:languages @=@ dnet:languages @=@ que @=@ Quechua +dnet:languages @=@ dnet:languages @=@ roh @=@ Raeto-Romance +dnet:languages @=@ dnet:languages @=@ raj @=@ Rajasthani +dnet:languages @=@ dnet:languages @=@ rar @=@ Rarotongan +dnet:languages @=@ dnet:languages @=@ roa @=@ Romance +dnet:languages @=@ dnet:languages @=@ ron/rum @=@ Romanian +dnet:languages @=@ dnet:languages @=@ rom @=@ Romany +dnet:languages @=@ dnet:languages @=@ run @=@ Rundi +dnet:languages @=@ dnet:languages @=@ rus @=@ Russian +dnet:languages @=@ dnet:languages @=@ sal @=@ Salishan +dnet:languages @=@ dnet:languages @=@ sam @=@ Samaritan +dnet:languages @=@ dnet:languages @=@ smi @=@ Sami +dnet:languages @=@ dnet:languages @=@ smo @=@ Samoan +dnet:languages @=@ dnet:languages @=@ sad @=@ Sandawe +dnet:languages @=@ dnet:languages @=@ sag @=@ Sango +dnet:languages @=@ dnet:languages @=@ san @=@ Sanskrit +dnet:languages @=@ dnet:languages @=@ srd @=@ Sardinian +dnet:languages @=@ dnet:languages @=@ sco @=@ Scots +dnet:languages @=@ dnet:languages @=@ sel @=@ Selkup +dnet:languages @=@ dnet:languages @=@ sem @=@ Semitic +dnet:languages @=@ dnet:languages @=@ srp @=@ Serbian +dnet:languages @=@ dnet:languages @=@ scr @=@ Serbo-Croatian +dnet:languages @=@ dnet:languages @=@ srr @=@ Serer +dnet:languages @=@ dnet:languages @=@ shn @=@ Shan +dnet:languages @=@ dnet:languages @=@ sna @=@ Shona +dnet:languages @=@ dnet:languages @=@ iii @=@ Sichuan Yi +dnet:languages @=@ dnet:languages @=@ sid @=@ Sidamo +dnet:languages @=@ dnet:languages @=@ bla @=@ Siksika +dnet:languages @=@ dnet:languages @=@ snd @=@ Sindhi +dnet:languages @=@ dnet:languages @=@ sin @=@ Sinhala; Sinhalese +dnet:languages @=@ dnet:languages @=@ sit @=@ Sino-Tibetan +dnet:languages @=@ dnet:languages @=@ sio @=@ Siouan +dnet:languages @=@ dnet:languages @=@ sla @=@ Slavic +dnet:languages @=@ dnet:languages @=@ slk/slo @=@ Slovak +dnet:languages @=@ dnet:languages @=@ slv @=@ Slovenian +dnet:languages @=@ dnet:languages @=@ sog @=@ Sogdian +dnet:languages @=@ dnet:languages @=@ som @=@ Somali +dnet:languages @=@ 
dnet:languages @=@ son @=@ Songhai +dnet:languages @=@ dnet:languages @=@ wen @=@ Sorbian +dnet:languages @=@ dnet:languages @=@ nso @=@ Sotho +dnet:languages @=@ dnet:languages @=@ sot @=@ Sotho, Southern +dnet:languages @=@ dnet:languages @=@ sai @=@ South American Indian +dnet:languages @=@ dnet:languages @=@ esl/spa @=@ Spanish +dnet:languages @=@ dnet:languages @=@ spa @=@ Spanish; Castilian +dnet:languages @=@ dnet:languages @=@ suk @=@ Sukuma +dnet:languages @=@ dnet:languages @=@ sux @=@ Sumerian +dnet:languages @=@ dnet:languages @=@ sun @=@ Sundanese +dnet:languages @=@ dnet:languages @=@ sus @=@ Susu +dnet:languages @=@ dnet:languages @=@ swa @=@ Swahili +dnet:languages @=@ dnet:languages @=@ ssw @=@ Swati +dnet:languages @=@ dnet:languages @=@ swe @=@ Swedish +dnet:languages @=@ dnet:languages @=@ syr @=@ Syriac +dnet:languages @=@ dnet:languages @=@ tgl @=@ Tagalog +dnet:languages @=@ dnet:languages @=@ tah @=@ Tahitian +dnet:languages @=@ dnet:languages @=@ tgk @=@ Tajik +dnet:languages @=@ dnet:languages @=@ tmh @=@ Tamashek +dnet:languages @=@ dnet:languages @=@ tam @=@ Tamil +dnet:languages @=@ dnet:languages @=@ tat @=@ Tatar +dnet:languages @=@ dnet:languages @=@ tel @=@ Telugu +dnet:languages @=@ dnet:languages @=@ ter @=@ Tereno +dnet:languages @=@ dnet:languages @=@ tha @=@ Thai +dnet:languages @=@ dnet:languages @=@ bod/tib @=@ Tibetan +dnet:languages @=@ dnet:languages @=@ tig @=@ Tigre +dnet:languages @=@ dnet:languages @=@ tir @=@ Tigrinya +dnet:languages @=@ dnet:languages @=@ tem @=@ Timne +dnet:languages @=@ dnet:languages @=@ tiv @=@ Tivi +dnet:languages @=@ dnet:languages @=@ tli @=@ Tlingit +dnet:languages @=@ dnet:languages @=@ ton @=@ Tonga (Tonga Islands) +dnet:languages @=@ dnet:languages @=@ tog @=@ Tonga(Nyasa) +dnet:languages @=@ dnet:languages @=@ tru @=@ Truk +dnet:languages @=@ dnet:languages @=@ tsi @=@ Tsimshian +dnet:languages @=@ dnet:languages @=@ tso @=@ Tsonga +dnet:languages @=@ dnet:languages @=@ tsn @=@ Tswana +dnet:languages @=@ dnet:languages @=@ tum @=@ Tumbuka +dnet:languages @=@ dnet:languages @=@ tur @=@ Turkish +dnet:languages @=@ dnet:languages @=@ tuk @=@ Turkmen +dnet:languages @=@ dnet:languages @=@ tyv @=@ Tuvinian +dnet:languages @=@ dnet:languages @=@ twi @=@ Twi +dnet:languages @=@ dnet:languages @=@ uga @=@ Ugaritic +dnet:languages @=@ dnet:languages @=@ uig @=@ Uighur; Uyghur +dnet:languages @=@ dnet:languages @=@ ukr @=@ Ukrainian +dnet:languages @=@ dnet:languages @=@ umb @=@ Umbundu +dnet:languages @=@ dnet:languages @=@ und @=@ Undetermined +dnet:languages @=@ dnet:languages @=@ urd @=@ Urdu +dnet:languages @=@ dnet:languages @=@ uzb @=@ Uzbek +dnet:languages @=@ dnet:languages @=@ vai @=@ Vai +dnet:languages @=@ dnet:languages @=@ ven @=@ Venda +dnet:languages @=@ dnet:languages @=@ vie @=@ Vietnamese +dnet:languages @=@ dnet:languages @=@ vol @=@ Volapük +dnet:languages @=@ dnet:languages @=@ vot @=@ Votic +dnet:languages @=@ dnet:languages @=@ wak @=@ Wakashan +dnet:languages @=@ dnet:languages @=@ wal @=@ Walamo +dnet:languages @=@ dnet:languages @=@ wln @=@ Walloon +dnet:languages @=@ dnet:languages @=@ war @=@ Waray +dnet:languages @=@ dnet:languages @=@ was @=@ Washo +dnet:languages @=@ dnet:languages @=@ cym/wel @=@ Welsh +dnet:languages @=@ dnet:languages @=@ wol @=@ Wolof +dnet:languages @=@ dnet:languages @=@ xho @=@ Xhosa +dnet:languages @=@ dnet:languages @=@ sah @=@ Yakut +dnet:languages @=@ dnet:languages @=@ yao @=@ Yao +dnet:languages @=@ dnet:languages @=@ yap @=@ Yap +dnet:languages @=@ 
dnet:languages @=@ yid @=@ Yiddish +dnet:languages @=@ dnet:languages @=@ yor @=@ Yoruba +dnet:languages @=@ dnet:languages @=@ zap @=@ Zapotec +dnet:languages @=@ dnet:languages @=@ zen @=@ Zenaga +dnet:languages @=@ dnet:languages @=@ zha @=@ Zhuang; Chuang +dnet:languages @=@ dnet:languages @=@ zul @=@ Zulu +dnet:languages @=@ dnet:languages @=@ zun @=@ Zuni +dnet:languages @=@ dnet:languages @=@ sga @=@ old Irish +nsf:contractTypes @=@ NSF Contract Types @=@ BOA/Task Order @=@ BOA/Task Order +nsf:contractTypes @=@ NSF Contract Types @=@ Continuing grant @=@ Continuing grant +nsf:contractTypes @=@ NSF Contract Types @=@ Contract @=@ Contract +nsf:contractTypes @=@ NSF Contract Types @=@ Contract Interagency Agreement @=@ Contract Interagency Agreement +nsf:contractTypes @=@ NSF Contract Types @=@ Cooperative Agreement @=@ Cooperative Agreement +nsf:contractTypes @=@ NSF Contract Types @=@ Fellowship @=@ Fellowship +nsf:contractTypes @=@ NSF Contract Types @=@ Fixed Price Award @=@ Fixed Price Award +nsf:contractTypes @=@ NSF Contract Types @=@ GAA @=@ GAA +nsf:contractTypes @=@ NSF Contract Types @=@ Interagency Agreement @=@ Interagency Agreement +nsf:contractTypes @=@ NSF Contract Types @=@ Intergovernmental Personnel Award @=@ Intergovernmental Personnel Award +nsf:contractTypes @=@ NSF Contract Types @=@ Personnel Agreement @=@ Personnel Agreement +nsf:contractTypes @=@ NSF Contract Types @=@ Standard Grant @=@ Standard Grant +ec:funding_relations @=@ ec:funding_relations @=@ ec:hasframeworkprogram @=@ hasframeworkprogram +ec:funding_relations @=@ ec:funding_relations @=@ ec:hasprogram @=@ hasprogram +ec:funding_relations @=@ ec:funding_relations @=@ ec:hasspecificprogram @=@ hasspecificprogram +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ UNKNOWN @=@ UNKNOWN +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ collection @=@ collection +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ dataset @=@ dataset +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ event @=@ event +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ film @=@ film +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ image @=@ image +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ interactiveResource @=@ interactiveResource +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ model @=@ model +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ physicalObject @=@ physicalObject +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ service @=@ service +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ software @=@ software +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ sound @=@ sound +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ text @=@ text +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ clinicalTrial @=@ Clinical trial +dnet:dataCite_title @=@ dnet:dataCite_title @=@ alternative title @=@ alternative title +dnet:dataCite_title @=@ dnet:dataCite_title @=@ main title @=@ main title +dnet:dataCite_title @=@ dnet:dataCite_title @=@ subtitle @=@ subtitle +dnet:dataCite_title @=@ dnet:dataCite_title @=@ translated title @=@ translated title +datacite:relation_typologies @=@ datacite:relation_typologies @=@ IsCitedBy @=@ IsCitedBy +datacite:relation_typologies @=@ datacite:relation_typologies @=@ IsNewVersionOf @=@ IsNewVersionOf +datacite:relation_typologies @=@ datacite:relation_typologies @=@ IsPartOf @=@ IsPartOf +datacite:relation_typologies @=@ datacite:relation_typologies @=@ IsPreviousVersionOf @=@ IsPreviousVersionOf 
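The vocabulary records above and below follow a fixed four-field layout separated by " @=@ ": vocabulary id, vocabulary display name (usually identical to the id), term code, and human-readable term label. A minimal Scala sketch of parsing one such record follows; the field semantics are inferred from the data itself, and Term/parse are illustrative names, not the project's actual vocabulary-loader API. The listing resumes right after the sketch.

object TermsFileSketch {
  // One record of the terms dump: vocabulary id, term code, display label.
  // (Assumed field meanings; the second field, the vocabulary display name, is dropped.)
  case class Term(vocabulary: String, code: String, label: String)

  // Split a line such as "dnet:countries @=@ dnet:countries @=@ IT @=@ Italy"
  // on the "@=@" separator; malformed lines yield None.
  def parse(line: String): Option[Term] =
    line.split("@=@").map(_.trim) match {
      case Array(vocab, _, code, label) => Some(Term(vocab, code, label))
      case _                            => None
    }

  def main(args: Array[String]): Unit = {
    // prints: Some(Term(dnet:countries,IT,Italy))
    println(parse("dnet:countries @=@ dnet:countries @=@ IT @=@ Italy"))
  }
}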
+datacite:relation_typologies @=@ datacite:relation_typologies @=@ IsReferencedBy @=@ IsReferencedBy +datacite:relation_typologies @=@ datacite:relation_typologies @=@ References @=@ References +datacite:relation_typologies @=@ datacite:relation_typologies @=@ UNKNOWN @=@ UNKNOWN +dnet:result_typologies @=@ dnet:result_typologies @=@ dataset @=@ dataset +dnet:result_typologies @=@ dnet:result_typologies @=@ other @=@ other +dnet:result_typologies @=@ dnet:result_typologies @=@ publication @=@ publication +dnet:result_typologies @=@ dnet:result_typologies @=@ software @=@ software +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERC-ADG @=@ Advanced Grant +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ BBI-CSA @=@ Bio-based Industries Coordination and Support action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ BBI-IA-DEMO @=@ Bio-based Industries Innovation action - Demonstration +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ BBI-IA-FLAG @=@ Bio-based Industries Innovation action - Flagship +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ BBI-RIA @=@ Bio-based Industries Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-IF-EF-CAR @=@ CAR – Career Restart panel +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ COFUND-EJP @=@ COFUND (European Joint Programme) +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ COFUND-PCP @=@ COFUND (PCP) +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ COFUND-PPI @=@ COFUND (PPI) +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ CS2-CSA @=@ CS2 Coordination and Support action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ CS2-IA @=@ CS2 Innovation Action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ CS2-RIA @=@ CS2 Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ CSA-LS @=@ CSA Lump sum +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERC-COG @=@ Consolidator Grant +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ FCH2-CSA @=@ Coordination & support action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ CSA @=@ Coordination and support action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-COFUND-DP @=@ Doctoral programmes +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ECSEL-CSA @=@ ECSEL Coordination & Support action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ECSEL-IA @=@ ECSEL Innovation Action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ECSEL-RIA @=@ ECSEL Research and Innovation Actions +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERA-NET-Cofund @=@ ERA-NET Cofund +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERC-POC-LS @=@ ERC Proof of Concept Lump Sum Pilot +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERC-SyG @=@ ERC Synergy Grant +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERC-LVG @=@ ERC low value grant +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ H2020-EEN-SGA @=@ Enterprise Europe Network - Specific Grant Agreement +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-ITN-EID @=@ European Industrial Doctorates +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-ITN-EJD @=@ European Joint Doctorates +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-ITN-ETN @=@ European Training Networks +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ FCH2-IA @=@ FCH2 Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ FCH2-RIA @=@ FCH2 Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ 
MSCA-COFUND-FP @=@ Fellowship programmes +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-IF-GF @=@ Global Fellowships +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ IMI2-CSA @=@ IMI2 Coordination & support action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ IMI2-RIA @=@ IMI2 Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ Shift2Rail-IA-LS @=@ Innovation Action Lump-Sum +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ IA-LS @=@ Innovation Action Lump-Sum +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ IA @=@ Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ Shift2Rail-IA @=@ Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ PCP @=@ Pre-Commercial Procurement +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERC-POC @=@ Proof of Concept Grant +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ PPI @=@ Public Procurement of Innovative Solutions +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-IF-EF-RI @=@ RI – Reintegration panel +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-RISE @=@ RISE +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ Shift2Rail-RIA-LS @=@ Research and Innovation Action Lump-Sum +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ Shift2Rail-RIA @=@ Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ RIA @=@ Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ RIA-LS @=@ Research and Innovation action Lump Sum +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SESAR-CSA @=@ SESAR: Coordination and Support Action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SESAR-IA @=@ SESAR: Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SESAR-RIA @=@ SESAR: Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SGA-RIA @=@ SGA Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SME-2b @=@ SME Instrument (grant only and blended finance) +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SME-1 @=@ SME instrument phase 1 +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SME-2 @=@ SME instrument phase 2 +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ Shift2Rail-CSA @=@ Shift2Rail - Coordination and Support action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-IF-EF-SE @=@ Society and Enterprise panel +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SGA-CSA @=@ Specific Grant agreement and Coordination and Support Action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-IF-EF-ST @=@ Standard EF +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERC-STG @=@ Starting Grant +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-SNLS @=@ Grant to identified beneficiary - Coordination and support actions (MSCA-Special Needs lump sum) +wt:contractTypes @=@ wt:contractTypes @=@ UNKNOWN @=@ UNKNOWN +dnet:countries @=@ dnet:countries @=@ AF @=@ Afghanistan +dnet:countries @=@ dnet:countries @=@ AL @=@ Albania +dnet:countries @=@ dnet:countries @=@ DZ @=@ Algeria +dnet:countries @=@ dnet:countries @=@ AS @=@ American Samoa +dnet:countries @=@ dnet:countries @=@ AD @=@ Andorra +dnet:countries @=@ dnet:countries @=@ AO @=@ Angola +dnet:countries @=@ dnet:countries @=@ AI @=@ Anguilla +dnet:countries @=@ dnet:countries @=@ AQ @=@ Antarctica +dnet:countries @=@ dnet:countries @=@ AG @=@ Antigua and Barbuda +dnet:countries @=@ dnet:countries @=@ AR @=@ Argentina +dnet:countries 
@=@ dnet:countries @=@ AM @=@ Armenia +dnet:countries @=@ dnet:countries @=@ AW @=@ Aruba +dnet:countries @=@ dnet:countries @=@ AU @=@ Australia +dnet:countries @=@ dnet:countries @=@ AT @=@ Austria +dnet:countries @=@ dnet:countries @=@ AZ @=@ Azerbaijan +dnet:countries @=@ dnet:countries @=@ BS @=@ Bahamas +dnet:countries @=@ dnet:countries @=@ BH @=@ Bahrain +dnet:countries @=@ dnet:countries @=@ BD @=@ Bangladesh +dnet:countries @=@ dnet:countries @=@ BB @=@ Barbados +dnet:countries @=@ dnet:countries @=@ BY @=@ Belarus +dnet:countries @=@ dnet:countries @=@ BE @=@ Belgium +dnet:countries @=@ dnet:countries @=@ BZ @=@ Belize +dnet:countries @=@ dnet:countries @=@ BJ @=@ Benin +dnet:countries @=@ dnet:countries @=@ BM @=@ Bermuda +dnet:countries @=@ dnet:countries @=@ BT @=@ Bhutan +dnet:countries @=@ dnet:countries @=@ BO @=@ Bolivia +dnet:countries @=@ dnet:countries @=@ BQ @=@ Bonaire, Sint Eustatius and Saba +dnet:countries @=@ dnet:countries @=@ BA @=@ Bosnia and Herzegovina +dnet:countries @=@ dnet:countries @=@ BW @=@ Botswana +dnet:countries @=@ dnet:countries @=@ BV @=@ Bouvet Island +dnet:countries @=@ dnet:countries @=@ BR @=@ Brazil +dnet:countries @=@ dnet:countries @=@ IO @=@ British Indian Ocean Territory +dnet:countries @=@ dnet:countries @=@ BN @=@ Brunei Darussalam +dnet:countries @=@ dnet:countries @=@ BG @=@ Bulgaria +dnet:countries @=@ dnet:countries @=@ BF @=@ Burkina Faso +dnet:countries @=@ dnet:countries @=@ BI @=@ Burundi +dnet:countries @=@ dnet:countries @=@ KH @=@ Cambodia +dnet:countries @=@ dnet:countries @=@ CM @=@ Cameroon +dnet:countries @=@ dnet:countries @=@ CA @=@ Canada +dnet:countries @=@ dnet:countries @=@ CV @=@ Cape Verde +dnet:countries @=@ dnet:countries @=@ KY @=@ Cayman Islands +dnet:countries @=@ dnet:countries @=@ CF @=@ Central African Republic +dnet:countries @=@ dnet:countries @=@ TD @=@ Chad +dnet:countries @=@ dnet:countries @=@ CL @=@ Chile +dnet:countries @=@ dnet:countries @=@ CN @=@ China (People's Republic of) +dnet:countries @=@ dnet:countries @=@ CX @=@ Christmas Island +dnet:countries @=@ dnet:countries @=@ CC @=@ Cocos (Keeling) Islands +dnet:countries @=@ dnet:countries @=@ CO @=@ Colombia +dnet:countries @=@ dnet:countries @=@ KM @=@ Comoros +dnet:countries @=@ dnet:countries @=@ CG @=@ Congo +dnet:countries @=@ dnet:countries @=@ CD @=@ Congo (Democratic Republic of) +dnet:countries @=@ dnet:countries @=@ CK @=@ Cook Islands +dnet:countries @=@ dnet:countries @=@ CR @=@ Costa Rica +dnet:countries @=@ dnet:countries @=@ CI @=@ Cote d'Ivoire +dnet:countries @=@ dnet:countries @=@ HR @=@ Croatia +dnet:countries @=@ dnet:countries @=@ CU @=@ Cuba +dnet:countries @=@ dnet:countries @=@ CW @=@ Curaçao +dnet:countries @=@ dnet:countries @=@ CY @=@ Cyprus +dnet:countries @=@ dnet:countries @=@ CZ @=@ Czech Republic +dnet:countries @=@ dnet:countries @=@ DK @=@ Denmark +dnet:countries @=@ dnet:countries @=@ DJ @=@ Djibouti +dnet:countries @=@ dnet:countries @=@ DM @=@ Dominica +dnet:countries @=@ dnet:countries @=@ DO @=@ Dominican Republic +dnet:countries @=@ dnet:countries @=@ EC @=@ Ecuador +dnet:countries @=@ dnet:countries @=@ EG @=@ Egypt +dnet:countries @=@ dnet:countries @=@ SV @=@ El Salvador +dnet:countries @=@ dnet:countries @=@ GQ @=@ Equatorial Guinea +dnet:countries @=@ dnet:countries @=@ ER @=@ Eritrea +dnet:countries @=@ dnet:countries @=@ EE @=@ Estonia +dnet:countries @=@ dnet:countries @=@ ET @=@ Ethiopia +dnet:countries @=@ dnet:countries @=@ EU @=@ European Union +dnet:countries @=@ dnet:countries @=@ FK @=@ 
Falkland Islands (Malvinas) +dnet:countries @=@ dnet:countries @=@ FO @=@ Faroe Islands +dnet:countries @=@ dnet:countries @=@ FJ @=@ Fiji +dnet:countries @=@ dnet:countries @=@ FI @=@ Finland +dnet:countries @=@ dnet:countries @=@ MK @=@ Former Yugoslav Republic of Macedonia +dnet:countries @=@ dnet:countries @=@ FR @=@ France +dnet:countries @=@ dnet:countries @=@ GF @=@ French Guiana +dnet:countries @=@ dnet:countries @=@ PF @=@ French Polynesia +dnet:countries @=@ dnet:countries @=@ TF @=@ French Southern Territories +dnet:countries @=@ dnet:countries @=@ GA @=@ Gabon +dnet:countries @=@ dnet:countries @=@ GM @=@ Gambia +dnet:countries @=@ dnet:countries @=@ GE @=@ Georgia +dnet:countries @=@ dnet:countries @=@ DE @=@ Germany +dnet:countries @=@ dnet:countries @=@ GH @=@ Ghana +dnet:countries @=@ dnet:countries @=@ GI @=@ Gibraltar +dnet:countries @=@ dnet:countries @=@ GR @=@ Greece +dnet:countries @=@ dnet:countries @=@ GL @=@ Greenland +dnet:countries @=@ dnet:countries @=@ GD @=@ Grenada +dnet:countries @=@ dnet:countries @=@ GP @=@ Guadeloupe +dnet:countries @=@ dnet:countries @=@ GU @=@ Guam +dnet:countries @=@ dnet:countries @=@ GT @=@ Guatemala +dnet:countries @=@ dnet:countries @=@ GG @=@ Guernsey +dnet:countries @=@ dnet:countries @=@ GN @=@ Guinea +dnet:countries @=@ dnet:countries @=@ GW @=@ Guinea-Bissau +dnet:countries @=@ dnet:countries @=@ GY @=@ Guyana +dnet:countries @=@ dnet:countries @=@ HT @=@ Haiti +dnet:countries @=@ dnet:countries @=@ HM @=@ Heard Island and McDonald Islands +dnet:countries @=@ dnet:countries @=@ VA @=@ Holy See (Vatican City State) +dnet:countries @=@ dnet:countries @=@ HN @=@ Honduras +dnet:countries @=@ dnet:countries @=@ HK @=@ Hong Kong +dnet:countries @=@ dnet:countries @=@ HU @=@ Hungary +dnet:countries @=@ dnet:countries @=@ IS @=@ Iceland +dnet:countries @=@ dnet:countries @=@ IN @=@ India +dnet:countries @=@ dnet:countries @=@ ID @=@ Indonesia +dnet:countries @=@ dnet:countries @=@ IR @=@ Iran (Islamic Republic of) +dnet:countries @=@ dnet:countries @=@ IQ @=@ Iraq +dnet:countries @=@ dnet:countries @=@ IE @=@ Ireland +dnet:countries @=@ dnet:countries @=@ IM @=@ Isle of Man +dnet:countries @=@ dnet:countries @=@ IL @=@ Israel +dnet:countries @=@ dnet:countries @=@ IT @=@ Italy +dnet:countries @=@ dnet:countries @=@ JM @=@ Jamaica +dnet:countries @=@ dnet:countries @=@ JP @=@ Japan +dnet:countries @=@ dnet:countries @=@ JE @=@ Jersey +dnet:countries @=@ dnet:countries @=@ JO @=@ Jordan +dnet:countries @=@ dnet:countries @=@ KZ @=@ Kazakhstan +dnet:countries @=@ dnet:countries @=@ KE @=@ Kenya +dnet:countries @=@ dnet:countries @=@ KI @=@ Kiribati +dnet:countries @=@ dnet:countries @=@ KR @=@ Korea (Republic of) +dnet:countries @=@ dnet:countries @=@ KP @=@ Korea, Democratic People's Republic of +dnet:countries @=@ dnet:countries @=@ XK @=@ Kosovo * UN resolution +dnet:countries @=@ dnet:countries @=@ KW @=@ Kuwait +dnet:countries @=@ dnet:countries @=@ KG @=@ Kyrgyzstan +dnet:countries @=@ dnet:countries @=@ LA @=@ Lao (People's Democratic Republic) +dnet:countries @=@ dnet:countries @=@ LV @=@ Latvia +dnet:countries @=@ dnet:countries @=@ LB @=@ Lebanon +dnet:countries @=@ dnet:countries @=@ LS @=@ Lesotho +dnet:countries @=@ dnet:countries @=@ LR @=@ Liberia +dnet:countries @=@ dnet:countries @=@ LY @=@ Libyan Arab Jamahiriya +dnet:countries @=@ dnet:countries @=@ LI @=@ Liechtenstein +dnet:countries @=@ dnet:countries @=@ LT @=@ Lithuania +dnet:countries @=@ dnet:countries @=@ LU @=@ Luxembourg +dnet:countries @=@ dnet:countries @=@
MO @=@ Macao +dnet:countries @=@ dnet:countries @=@ MG @=@ Madagascar +dnet:countries @=@ dnet:countries @=@ MW @=@ Malawi +dnet:countries @=@ dnet:countries @=@ MY @=@ Malaysia +dnet:countries @=@ dnet:countries @=@ MV @=@ Maldives +dnet:countries @=@ dnet:countries @=@ ML @=@ Mali +dnet:countries @=@ dnet:countries @=@ MT @=@ Malta +dnet:countries @=@ dnet:countries @=@ MH @=@ Marshall Islands +dnet:countries @=@ dnet:countries @=@ MQ @=@ Martinique +dnet:countries @=@ dnet:countries @=@ MR @=@ Mauritania +dnet:countries @=@ dnet:countries @=@ MU @=@ Mauritius +dnet:countries @=@ dnet:countries @=@ YT @=@ Mayotte +dnet:countries @=@ dnet:countries @=@ MX @=@ Mexico +dnet:countries @=@ dnet:countries @=@ FM @=@ Micronesia, Federated States of +dnet:countries @=@ dnet:countries @=@ MD @=@ Moldova (Republic of) +dnet:countries @=@ dnet:countries @=@ MN @=@ Mongolia +dnet:countries @=@ dnet:countries @=@ ME @=@ Montenegro +dnet:countries @=@ dnet:countries @=@ MS @=@ Montserrat +dnet:countries @=@ dnet:countries @=@ MA @=@ Morocco +dnet:countries @=@ dnet:countries @=@ MZ @=@ Mozambique +dnet:countries @=@ dnet:countries @=@ MM @=@ Myanmar +dnet:countries @=@ dnet:countries @=@ NA @=@ Namibia +dnet:countries @=@ dnet:countries @=@ NR @=@ Nauru +dnet:countries @=@ dnet:countries @=@ NP @=@ Nepal +dnet:countries @=@ dnet:countries @=@ NL @=@ Netherlands +dnet:countries @=@ dnet:countries @=@ AN @=@ Netherlands Antilles +dnet:countries @=@ dnet:countries @=@ NC @=@ New Caledonia +dnet:countries @=@ dnet:countries @=@ NZ @=@ New Zealand +dnet:countries @=@ dnet:countries @=@ NI @=@ Nicaragua +dnet:countries @=@ dnet:countries @=@ NE @=@ Niger +dnet:countries @=@ dnet:countries @=@ NG @=@ Nigeria +dnet:countries @=@ dnet:countries @=@ NU @=@ Niue +dnet:countries @=@ dnet:countries @=@ NF @=@ Norfolk Island +dnet:countries @=@ dnet:countries @=@ MP @=@ Northern Mariana Islands +dnet:countries @=@ dnet:countries @=@ NO @=@ Norway +dnet:countries @=@ dnet:countries @=@ OC @=@ Oceania +dnet:countries @=@ dnet:countries @=@ OM @=@ Oman +dnet:countries @=@ dnet:countries @=@ PK @=@ Pakistan +dnet:countries @=@ dnet:countries @=@ PW @=@ Palau +dnet:countries @=@ dnet:countries @=@ PS @=@ Palestinian-administered areas +dnet:countries @=@ dnet:countries @=@ PA @=@ Panama +dnet:countries @=@ dnet:countries @=@ PG @=@ Papua New Guinea +dnet:countries @=@ dnet:countries @=@ PY @=@ Paraguay +dnet:countries @=@ dnet:countries @=@ PE @=@ Peru +dnet:countries @=@ dnet:countries @=@ PH @=@ Philippines +dnet:countries @=@ dnet:countries @=@ PN @=@ Pitcairn +dnet:countries @=@ dnet:countries @=@ PL @=@ Poland +dnet:countries @=@ dnet:countries @=@ PT @=@ Portugal +dnet:countries @=@ dnet:countries @=@ PR @=@ Puerto Rico +dnet:countries @=@ dnet:countries @=@ QA @=@ Qatar +dnet:countries @=@ dnet:countries @=@ RO @=@ Romania +dnet:countries @=@ dnet:countries @=@ RU @=@ Russian Federation +dnet:countries @=@ dnet:countries @=@ RW @=@ Rwanda +dnet:countries @=@ dnet:countries @=@ RE @=@ Réunion +dnet:countries @=@ dnet:countries @=@ SH @=@ Saint Helena, Ascension and Tristan da Cunha +dnet:countries @=@ dnet:countries @=@ KN @=@ Saint Kitts and Nevis +dnet:countries @=@ dnet:countries @=@ LC @=@ Saint Lucia +dnet:countries @=@ dnet:countries @=@ MF @=@ Saint Martin (French Part) +dnet:countries @=@ dnet:countries @=@ PM @=@ Saint Pierre and Miquelon +dnet:countries @=@ dnet:countries @=@ VC @=@ Saint Vincent and the Grenadines +dnet:countries @=@ dnet:countries @=@ BL @=@ Saint-Barthélemy +dnet:countries @=@ 
dnet:countries @=@ WS @=@ Samoa +dnet:countries @=@ dnet:countries @=@ SM @=@ San Marino +dnet:countries @=@ dnet:countries @=@ SA @=@ Saudi Arabia +dnet:countries @=@ dnet:countries @=@ SN @=@ Senegal +dnet:countries @=@ dnet:countries @=@ RS @=@ Serbia +dnet:countries @=@ dnet:countries @=@ CS @=@ Serbia and Montenegro +dnet:countries @=@ dnet:countries @=@ SC @=@ Seychelles +dnet:countries @=@ dnet:countries @=@ SL @=@ Sierra Leone +dnet:countries @=@ dnet:countries @=@ SG @=@ Singapore +dnet:countries @=@ dnet:countries @=@ SX @=@ Sint Maarten (Dutch Part) +dnet:countries @=@ dnet:countries @=@ SK @=@ Slovakia +dnet:countries @=@ dnet:countries @=@ SI @=@ Slovenia +dnet:countries @=@ dnet:countries @=@ SB @=@ Solomon Islands +dnet:countries @=@ dnet:countries @=@ SO @=@ Somalia +dnet:countries @=@ dnet:countries @=@ ZA @=@ South Africa +dnet:countries @=@ dnet:countries @=@ GS @=@ South Georgia and the South Sandwich Islands +dnet:countries @=@ dnet:countries @=@ SS @=@ South Sudan +dnet:countries @=@ dnet:countries @=@ ES @=@ Spain +dnet:countries @=@ dnet:countries @=@ LK @=@ Sri Lanka +dnet:countries @=@ dnet:countries @=@ SD @=@ Sudan +dnet:countries @=@ dnet:countries @=@ SR @=@ Suriname +dnet:countries @=@ dnet:countries @=@ SJ @=@ Svalbard and Jan Mayen +dnet:countries @=@ dnet:countries @=@ SZ @=@ Swaziland +dnet:countries @=@ dnet:countries @=@ SE @=@ Sweden +dnet:countries @=@ dnet:countries @=@ CH @=@ Switzerland +dnet:countries @=@ dnet:countries @=@ SY @=@ Syrian Arab Republic +dnet:countries @=@ dnet:countries @=@ ST @=@ São Tomé and Príncipe +dnet:countries @=@ dnet:countries @=@ TW @=@ Taiwan +dnet:countries @=@ dnet:countries @=@ TJ @=@ Tajikistan +dnet:countries @=@ dnet:countries @=@ TZ @=@ Tanzania (United Republic of) +dnet:countries @=@ dnet:countries @=@ TH @=@ Thailand +dnet:countries @=@ dnet:countries @=@ TL @=@ Timor-Leste +dnet:countries @=@ dnet:countries @=@ TG @=@ Togo +dnet:countries @=@ dnet:countries @=@ TK @=@ Tokelau +dnet:countries @=@ dnet:countries @=@ TO @=@ Tonga +dnet:countries @=@ dnet:countries @=@ TT @=@ Trinidad and Tobago +dnet:countries @=@ dnet:countries @=@ TN @=@ Tunisia +dnet:countries @=@ dnet:countries @=@ TR @=@ Turkey +dnet:countries @=@ dnet:countries @=@ TM @=@ Turkmenistan +dnet:countries @=@ dnet:countries @=@ TC @=@ Turks and Caicos Islands +dnet:countries @=@ dnet:countries @=@ TV @=@ Tuvalu +dnet:countries @=@ dnet:countries @=@ UNKNOWN @=@ UNKNOWN +dnet:countries @=@ dnet:countries @=@ UG @=@ Uganda +dnet:countries @=@ dnet:countries @=@ UA @=@ Ukraine +dnet:countries @=@ dnet:countries @=@ AE @=@ United Arab Emirates +dnet:countries @=@ dnet:countries @=@ GB @=@ United Kingdom +dnet:countries @=@ dnet:countries @=@ US @=@ United States +dnet:countries @=@ dnet:countries @=@ UM @=@ United States Minor Outlying Islands +dnet:countries @=@ dnet:countries @=@ UY @=@ Uruguay +dnet:countries @=@ dnet:countries @=@ UZ @=@ Uzbekistan +dnet:countries @=@ dnet:countries @=@ VU @=@ Vanuatu +dnet:countries @=@ dnet:countries @=@ VE @=@ Venezuela +dnet:countries @=@ dnet:countries @=@ VN @=@ Viet Nam +dnet:countries @=@ dnet:countries @=@ VG @=@ Virgin Islands (British) +dnet:countries @=@ dnet:countries @=@ VI @=@ Virgin Islands, U.S. 
+dnet:countries @=@ dnet:countries @=@ WF @=@ Wallis and Futuna +dnet:countries @=@ dnet:countries @=@ EH @=@ Western Sahara +dnet:countries @=@ dnet:countries @=@ YE @=@ Yemen +dnet:countries @=@ dnet:countries @=@ YU @=@ Yugoslavia +dnet:countries @=@ dnet:countries @=@ ZM @=@ Zambia +dnet:countries @=@ dnet:countries @=@ ZW @=@ Zimbabwe +dnet:countries @=@ dnet:countries @=@ AX @=@ Åland Islands +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ openaire2.0 @=@ OpenAIRE 2.0 (EC funding) +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ driver-openaire2.0 @=@ OpenAIRE 2.0+ (DRIVER OA, EC funding) +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ openaire3.0 @=@ OpenAIRE 3.0 (OA, funding) +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ openaire4.0 @=@ OpenAIRE 4.0 (inst.&thematic. repo.) +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ driver @=@ OpenAIRE Basic (DRIVER OA) +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ openaire2.0_data @=@ OpenAIRE Data (funded, referenced datasets) +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ hostedBy @=@ collected from a compatible aggregator +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ UNKNOWN @=@ not available +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ native @=@ proprietary +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ notCompatible @=@ under validation +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ openaire-cris_1.1 @=@ OpenAIRE CRIS v1.1 +fct:funding_relations @=@ fct:funding_relations @=@ fct:hasParentFunding @=@ fct:hasParentFunding +dnet:protocols @=@ dnet:protocols @=@ HTTPWithFileName @=@ HTTPWithFileName +dnet:protocols @=@ dnet:protocols @=@ NetCDF @=@ NetCDF +dnet:protocols @=@ dnet:protocols @=@ OpenDAP @=@ OpenDAP +dnet:protocols @=@ dnet:protocols @=@ schemaorg @=@ Schema.org +dnet:protocols @=@ dnet:protocols @=@ UNKNOWN @=@ UNKNOWN +dnet:protocols @=@ dnet:protocols @=@ api @=@ api +dnet:protocols @=@ dnet:protocols @=@ dataciteESPlugins @=@ dataciteESPlugins +dnet:protocols @=@ dnet:protocols @=@ datasetsbyjournal @=@ datasetsbyjournal +dnet:protocols @=@ dnet:protocols @=@ datasetsbyproject @=@ datasetsbyproject +dnet:protocols @=@ dnet:protocols @=@ excelFile @=@ excelFile +dnet:protocols @=@ dnet:protocols @=@ file @=@ file +dnet:protocols @=@ dnet:protocols @=@ fileGzip @=@ fileGzip +dnet:protocols @=@ dnet:protocols @=@ files_by_rpc @=@ files_by_rpc +dnet:protocols @=@ dnet:protocols @=@ files_from_mdstore @=@ files_from_mdstore +dnet:protocols @=@ dnet:protocols @=@ files_from_metadata @=@ files_from_metadata +dnet:protocols @=@ dnet:protocols @=@ filesystem @=@ filesystem +dnet:protocols @=@ dnet:protocols @=@ ftp @=@ ftp +dnet:protocols @=@ dnet:protocols @=@ gristProjects @=@ gristProjects +dnet:protocols @=@ dnet:protocols @=@ gtr2Projects @=@ gtr2Projects +dnet:protocols @=@ dnet:protocols @=@ http @=@ http +dnet:protocols @=@ dnet:protocols @=@ httpCSV @=@ httpCSV +dnet:protocols @=@ dnet:protocols @=@ httpList @=@ httpList +dnet:protocols @=@ dnet:protocols @=@ jdbc @=@ jdbc +dnet:protocols @=@ dnet:protocols @=@ oai @=@ oai +dnet:protocols @=@ dnet:protocols @=@ oai_sets @=@ oai_sets +dnet:protocols @=@ dnet:protocols @=@ other @=@ other +dnet:protocols @=@ dnet:protocols @=@ re3data @=@ re3data 
+dnet:protocols @=@ dnet:protocols @=@ rest @=@ rest +dnet:protocols @=@ dnet:protocols @=@ rest_json2xml @=@ rest_json2xml +dnet:protocols @=@ dnet:protocols @=@ sftp @=@ sftp +dnet:protocols @=@ dnet:protocols @=@ soap @=@ soap +dnet:protocols @=@ dnet:protocols @=@ sparql @=@ sparql +dnet:protocols @=@ dnet:protocols @=@ sword @=@ sword +dnet:protocols @=@ dnet:protocols @=@ targz @=@ targz +dnet:protocols @=@ dnet:protocols @=@ remoteMdstore @=@ remoteMdstore +wt:funding_typologies @=@ Wellcome Trust: Funding Typologies @=@ wt:fundingStream @=@ Wellcome Trust: Funding Stream +dnet:externalReference_typologies @=@ dnet:externalReference_typologies @=@ accessionNumber @=@ accessionNumber +dnet:externalReference_typologies @=@ dnet:externalReference_typologies @=@ dataset @=@ dataset +dnet:externalReference_typologies @=@ dnet:externalReference_typologies @=@ software @=@ software +datacite:id_typologies @=@ datacite:id_typologies @=@ ARK @=@ ARK +datacite:id_typologies @=@ datacite:id_typologies @=@ DOI @=@ DOI +datacite:id_typologies @=@ datacite:id_typologies @=@ EAN13 @=@ EAN13 +datacite:id_typologies @=@ datacite:id_typologies @=@ EISSN @=@ EISSN +datacite:id_typologies @=@ datacite:id_typologies @=@ Handle @=@ Handle +datacite:id_typologies @=@ datacite:id_typologies @=@ ISBN @=@ ISBN +datacite:id_typologies @=@ datacite:id_typologies @=@ ISSN @=@ ISSN +datacite:id_typologies @=@ datacite:id_typologies @=@ ISTC @=@ ISTC +datacite:id_typologies @=@ datacite:id_typologies @=@ LISSN @=@ LISSN +datacite:id_typologies @=@ datacite:id_typologies @=@ LSID @=@ LSID +datacite:id_typologies @=@ datacite:id_typologies @=@ PURL @=@ PURL +datacite:id_typologies @=@ datacite:id_typologies @=@ UNKNOWN @=@ UNKNOWN +datacite:id_typologies @=@ datacite:id_typologies @=@ UPC @=@ UPC +datacite:id_typologies @=@ datacite:id_typologies @=@ URL @=@ URL +datacite:id_typologies @=@ datacite:id_typologies @=@ URN @=@ URN +dnet:pid_types @=@ dnet:pid_types @=@ actrn @=@ ACTRN Identifier +dnet:pid_types @=@ dnet:pid_types @=@ nct @=@ ClinicalTrials.gov Identifier +dnet:pid_types @=@ dnet:pid_types @=@ euctr @=@ EU Clinical Trials Register +dnet:pid_types @=@ dnet:pid_types @=@ epo_id @=@ European Patent Office application ID +dnet:pid_types @=@ dnet:pid_types @=@ gsk @=@ GSK Identifier +dnet:pid_types @=@ dnet:pid_types @=@ GeoPass @=@ Geographic Location-Password Scheme +dnet:pid_types @=@ dnet:pid_types @=@ GBIF @=@ Global Biodiversity Information Facility +dnet:pid_types @=@ dnet:pid_types @=@ isrctn @=@ ISRCTN Identifier +dnet:pid_types @=@ dnet:pid_types @=@ ISNI @=@ International Standard Name Identifier +dnet:pid_types @=@ dnet:pid_types @=@ jprn @=@ JPRN Identifier +dnet:pid_types @=@ dnet:pid_types @=@ mag_id @=@ Microsoft Academic Graph Identifier +dnet:pid_types @=@ dnet:pid_types @=@ oai @=@ Open Archives Initiative +dnet:pid_types @=@ dnet:pid_types @=@ orcid @=@ Open Researcher and Contributor ID +dnet:pid_types @=@ dnet:pid_types @=@ PANGAEA @=@ PANGAEA +dnet:pid_types @=@ dnet:pid_types @=@ epo_nr_epodoc @=@ Patent application number in EPODOC format +dnet:pid_types @=@ dnet:pid_types @=@ UNKNOWN @=@ UNKNOWN +dnet:pid_types @=@ dnet:pid_types @=@ VIAF @=@ Virtual International Authority File +dnet:pid_types @=@ dnet:pid_types @=@ arXiv @=@ arXiv +dnet:pid_types @=@ dnet:pid_types @=@ doi @=@ doi +dnet:pid_types @=@ dnet:pid_types @=@ grid @=@ grid +dnet:pid_types @=@ dnet:pid_types @=@ info:eu-repo/dai @=@ info:eu-repo/dai +dnet:pid_types @=@ dnet:pid_types @=@ orcidworkid @=@ orcid workid 
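A typical consumer of the dnet:pid_types block above is a lookup that resolves a harvested PID-type code to its display label, falling back to the vocabulary's own UNKNOWN term for unlisted codes. A small illustrative sketch follows (the map hard-codes a few entries copied from the list; it is not the project's actual vocabulary service), after which the listing continues.

object PidTypeLookupSketch {
  // A handful of codes taken from the dnet:pid_types entries above.
  private val labels = Map(
    "doi"   -> "doi",
    "orcid" -> "Open Researcher and Contributor ID",
    "arXiv" -> "arXiv",
    "nct"   -> "ClinicalTrials.gov Identifier"
  )

  // Resolve a code to its label, defaulting to UNKNOWN as the vocabulary does.
  def labelOf(code: String): String = labels.getOrElse(code, "UNKNOWN")

  def main(args: Array[String]): Unit = {
    println(labelOf("orcid")) // Open Researcher and Contributor ID
    println(labelOf("issn"))  // UNKNOWN
  }
}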
+dnet:pid_types @=@ dnet:pid_types @=@ pmc @=@ pmc +dnet:pid_types @=@ dnet:pid_types @=@ pmid @=@ pmid +dnet:pid_types @=@ dnet:pid_types @=@ urn @=@ urn +dnet:pid_types @=@ dnet:pid_types @=@ who @=@ WHO Identifier +dnet:pid_types @=@ dnet:pid_types @=@ drks @=@ DRKS Identifier +dnet:pid_types @=@ dnet:pid_types @=@ handle @=@ Handle +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/ACM @=@ An ACM classification term that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/ARXIV @=@ An ARXIV classification term that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/DDC @=@ A Dewey Decimal classification term (DDC) that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/JEL @=@ A Journal of Economic Literature (JEL) classification term that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/OPENACCESS_VERSION @=@ An Open Access version of your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/DATASET/IS_REFERENCED_BY @=@ A dataset referenced by your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/DATASET/REFERENCES @=@ A dataset that refers to your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/DATASET/IS_RELATED_TO @=@ A dataset related to your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/DATASET/IS_SUPPLEMENTED_TO @=@ A dataset that supplements your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PUBLICATION/IS_RELATED_TO @=@ A publication related to your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PUBLICATION/REFERENCES @=@ A publication referenced by your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PUBLICATION/IS_REFERENCED_BY @=@ A publication that refers to your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PUBLICATION/IS_SUPPLEMENTED_BY @=@ A publication that is supplemented by your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PUBLICATION/IS_SUPPLEMENTED_TO @=@ A publication that supplements your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SOFTWARE @=@ A software referred by your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MORE/OPENACCESS_VERSION @=@ Another Open Access version of a publication +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MORE/PID @=@ Another persistent identifier associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/MESHEUROPMC @=@ A classification term from the Medical Subject Headings (MeSH) that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/ABSTRACT @=@ An abstract describing among your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PUBLICATION_DATE @=@ A date of publication missing in your content +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PID @=@ A persistent identifier associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MORE/SUBJECT/ACM @=@ Another ACM classification term that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MORE/SUBJECT/ARXIV @=@ Another ARXIV classification term that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MORE/SUBJECT/DDC @=@ Another Dewey Decimal
classification term (DDC) that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MORE/SUBJECT/JEL @=@ Another Journal of Economic Literature (JEL) classification term that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MORE/SUBJECT/MESHEUROPMC @=@ Another classification term from the Medical Subject Headings (MeSH) that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PROJECT @=@ A project reference that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/DATASET/IS_SUPPLEMENTED_BY @=@ A dataset that is supplemented by your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/AUTHOR/ORCID @=@ An Open Researcher and Contributor ID (ORCID) that can be associated to an author of your publications +dnet:review_levels @=@ dnet:review_levels @=@ 0000 @=@ Unknown +dnet:review_levels @=@ dnet:review_levels @=@ 0002 @=@ nonPeerReviewed +dnet:review_levels @=@ dnet:review_levels @=@ 0001 @=@ peerReviewed \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/tr.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/tr.xml index a9eae8576..77fccb4d3 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/tr.xml +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/tr.xml @@ -16,7 +16,7 @@ [hunk body (XML) lost in extraction] diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/zenodo_tr.xslt b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/zenodo_tr.xslt new file mode 100644 index 000000000..9a02c9071 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/zenodo_tr.xslt @@ -0,0 +1,445 @@ [445 added lines of XSLT markup lost in extraction; only the literal access-right values OPEN and CLOSED survive] \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala index 32c58595f..f2c63cee6 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala @@ -5,6 +5,7 @@ import eu.dnetlib.dhp.schema.oaf.{AccessRight, DataInfo, Dataset, Field, Instanc import eu.dnetlib.dhp.utils.DHPUtils import org.apache.commons.lang3.StringUtils import com.fasterxml.jackson.databind.ObjectMapper +import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.scholexplorer.OafUtils import org.json4s import org.json4s.DefaultFormats @@ -107,23 +108,17 @@ object DoiBoostMappingUtil { def fixResult(result: Dataset)
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/tr.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/tr.xml
index a9eae8576..77fccb4d3 100644
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/tr.xml
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/tr.xml
@@ -16,7 +16,7 @@
[XML hunk body lost in extraction]
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/zenodo_tr.xslt b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/zenodo_tr.xslt
new file mode 100644
index 000000000..9a02c9071
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/zenodo_tr.xslt
@@ -0,0 +1,445 @@
[445-line XSLT stylesheet; the markup was lost in extraction, and of the added lines only the literal access-right values OPEN and CLOSED survive]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
index 32c58595f..f2c63cee6 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
@@ -5,6 +5,7 @@ import eu.dnetlib.dhp.schema.oaf.{AccessRight, DataInfo, Dataset, Field, Instanc
 import eu.dnetlib.dhp.utils.DHPUtils
 import org.apache.commons.lang3.StringUtils
 import com.fasterxml.jackson.databind.ObjectMapper
+import eu.dnetlib.dhp.schema.common.ModelConstants
 import eu.dnetlib.dhp.schema.scholexplorer.OafUtils
 import org.json4s
 import org.json4s.DefaultFormats
@@ -107,23 +108,17 @@ object DoiBoostMappingUtil {
   def fixResult(result: Dataset):Dataset = {
-    val instanceType = result.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
+    val instanceType = extractInstance(result)
     if (instanceType.isDefined) {
       result.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
     }
     result.getInstance().asScala.foreach(i => {
-      i.setHostedby(getUbknownHostedBy())
+      i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY)
     })
     result
   }
-  def getUbknownHostedBy():KeyValue = {
-    val hb = new KeyValue
-    hb.setValue("Unknown Repository")
-    hb.setKey(s"10|$OPENAIRE_PREFIX::55045bd2a65019fd8e6741a755395c8c")
-    hb
-  }
   def getOpenAccessQualifier():AccessRight = {
@@ -135,6 +130,11 @@
   }
+
+  def extractInstance(r:Result):Option[Instance] = {
+    r.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
+  }
+
   def fixPublication(input:((String,Publication), (String,HostedByItemType))): Publication = {
     val publication = input._1._2
@@ -142,7 +142,7 @@
     val item = if (input._2 != null) input._2._2 else null
-    val instanceType = publication.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
+    val instanceType:Option[Instance] = extractInstance(publication)
     if (instanceType.isDefined) {
       publication.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
@@ -150,28 +150,30 @@
     publication.getInstance().asScala.foreach(i => {
-      val hb = new KeyValue
+      var hb = new KeyValue
       if (item != null) {
         hb.setValue(item.officialname)
         hb.setKey(generateDSId(item.id))
         if (item.openAccess) i.setAccessright(getOpenAccessQualifier())
-        publication.setBestaccessright(getOpenAccessQualifier())
+        val ar = getOpenAccessQualifier()
+        publication.setBestaccessright(OafUtils.createQualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
       }
       else {
-        hb.setValue("Unknown Repository")
-        hb.setKey(s"10|$OPENAIRE_PREFIX::55045bd2a65019fd8e6741a755395c8c")
+        hb = ModelConstants.UNKNOWN_REPOSITORY
       }
       i.setHostedby(hb)
     })
     val ar = publication.getInstance().asScala.filter(i => i.getInstancetype != null && i.getAccessright != null && i.getAccessright.getClassid != null).map(f => f.getAccessright.getClassid)
     if (ar.nonEmpty) {
-      if(ar.contains("OPEN")){
-        publication.setBestaccessright(getOpenAccessQualifier())
+      if(ar.contains(ModelConstants.ACCESS_RIGHT_OPEN)){
+        val ar = getOpenAccessQualifier()
+        publication.setBestaccessright(OafUtils.createQualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
       }
       else {
-        publication.setBestaccessright(getRestrictedQualifier())
+        val ar = getRestrictedQualifier()
+        publication.setBestaccessright(OafUtils.createQualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
       }
     }
     publication
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala
index 78477ae4d..3bfca0859 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala
@@ -38,37 +38,39 @@ object SparkGenerateDOIBoostActionSet {
     val crossRefRelation = parser.get("crossRefRelation")
     val dbaffiliationRelationPath = parser.get("dbaffiliationRelationPath")
     val dbOrganizationPath = parser.get("dbOrganizationPath")
-    val workingDirPath = parser.get("targetPath")
     val sequenceFilePath = parser.get("sFilePath")
     val asDataset = spark.read.load(dbDatasetPath).as[OafDataset]
+      .filter(p => p != null && p.getId != null)
      .map(d => DoiBoostMappingUtil.fixResult(d))
      .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
-//      .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/actionSet")
+
     val asPublication = spark.read.load(dbPublicationPath).as[Publication]
+      .filter(p => p != null && p.getId != null)
      .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
-//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
+
     val asOrganization = spark.read.load(dbOrganizationPath).as[Organization]
      .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
-//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
+
     val asCRelation = spark.read.load(crossRefRelation).as[Relation]
+      .filter(r => r != null && r.getSource != null && r.getTarget != null)
      .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
-//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
+
     val asRelAffiliation = spark.read.load(dbaffiliationRelationPath).as[Relation]
      .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
-//      .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
+
     val d: Dataset[(String, String)] = asDataset.union(asPublication).union(asOrganization).union(asCRelation).union(asRelAffiliation)
-//    spark.read.load(s"$workingDirPath/actionSet").as[(String,String)]
+
     d.rdd.repartition(6000).map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$sequenceFilePath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
index 43b3f7e1c..b051177f5 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
@@ -15,7 +15,7 @@ import scala.collection.JavaConverters._
 import scala.collection.mutable
 import scala.util.matching.Regex
-import eu.dnetlib.dhp.schema.scholexplorer.OafUtils;
+import eu.dnetlib.dhp.schema.scholexplorer.OafUtils
 case class CrossrefDT(doi: String, json:String, timestamp: Long) {}
@@ -182,7 +182,7 @@
     // Ticket #6281 added pid to Instance
     instance.setPid(result.getPid)
-    val has_review = (json \ "relation" \"has-review" \ "id")
+    val has_review = json \ "relation" \"has-review" \ "id"
     if(has_review != JNothing) {
       instance.setRefereed(
@@ -208,8 +208,9 @@
     instance.setUrl(links.asJava)
     result.setId(IdentifierFactory.createDOIBoostIdentifier(result))
     if (result.getId == null)
-      return null
-    result
+      null
+    else
+      result
   }
@@ -241,9 +242,9 @@
     val result = generateItemFromType(objectType, objectSubType)
     if (result == null)
       return List()
-    val cOBJCategory = mappingCrossrefSubType.getOrElse(objectType, mappingCrossrefSubType.getOrElse(objectSubType, "0038 Other literature type"));
+    val cOBJCategory = mappingCrossrefSubType.getOrElse(objectType, mappingCrossrefSubType.getOrElse(objectSubType, "0038
Other literature type")) mappingResult(result, json, cOBJCategory) - if (result == null) + if (result == null || result.getId == null) return List() diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala index 4568e23a5..cc112528e 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala @@ -59,6 +59,17 @@ class CrossrefMappingTest { } + @Test + def testSum() :Unit = { + val from:Long = 1613135645000L + val delta:Long = 1000000L + + + println(s"updating from value: $from -> ${from+delta}") + + + } + @Test def testOrcidID() :Unit = { val json = Source.fromInputStream(getClass.getResourceAsStream("orcid_data.json")).mkString diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml index 3e1d84c01..9a5d5033b 100644 --- a/dhp-workflows/dhp-graph-mapper/pom.xml +++ b/dhp-workflows/dhp-graph-mapper/pom.xml @@ -33,6 +33,10 @@ + + -Xmax-classfile-name + 140 + ${scala.version} @@ -67,7 +71,7 @@ test - org.apache.httpcomponents + org.apache.httpcomponents httpclient diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java index 3067d8639..86c453656 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java @@ -22,7 +22,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.utils.ISLookupClientFactory; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java index 08695ec67..f06059dd5 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java @@ -7,7 +7,7 @@ import java.util.HashMap; import org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer; -import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.AccessRight; import eu.dnetlib.dhp.schema.oaf.Country; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 3d75c426d..77d47b285 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -15,7 
+15,7 @@ import org.dom4j.Node; import com.google.common.collect.Lists; -import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java index 40020427a..48c0df334 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java @@ -3,7 +3,6 @@ package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import java.io.IOException; import java.util.Arrays; import java.util.List; import java.util.Objects; @@ -12,8 +11,6 @@ import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; @@ -27,7 +24,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.utils.ISLookupClientFactory; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index 823fd83d3..299ff7d78 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -55,10 +55,10 @@ import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.DbClient; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; import eu.dnetlib.dhp.oa.graph.raw.common.MigrateAction; import eu.dnetlib.dhp.oa.graph.raw.common.VerifyNsPrefixPredicate; -import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.Context; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Dataset; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java index e7703bf72..50042b569 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java @@ -11,8 +11,8 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import 
eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.MdstoreClient; import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; -import eu.dnetlib.dhp.oa.graph.raw.common.MdstoreClient; public class MigrateMongoMdstoresApplication extends AbstractMigrationApplication implements Closeable { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index 22bd6718a..ea9394890 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -18,7 +18,7 @@ import org.dom4j.Node; import com.google.common.collect.Lists; import eu.dnetlib.dhp.common.PacePerson; -import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index d4997cd2b..21dc5eff6 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -13,7 +13,7 @@ import org.dom4j.Document; import org.dom4j.Node; import eu.dnetlib.dhp.common.PacePerson; -import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java index e38101f82..fdbc58c17 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java @@ -18,7 +18,8 @@ import org.mockito.junit.jupiter.MockitoExtension; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @@ -78,7 +79,7 @@ public class CleaningFunctionTest { assertEquals("CLOSED", p_out.getInstance().get(0).getAccessright().getClassid()); assertEquals("Closed Access", p_out.getInstance().get(0).getAccessright().getClassname()); - Set pidTerms = vocabularies.getTerms("dnet:pid_types"); + Set pidTerms = vocabularies.getTerms(ModelConstants.DNET_PID_TYPES); assertTrue( p_out .getPid() @@ -86,11 +87,67 @@ public class CleaningFunctionTest { .map(p -> p.getQualifier()) .allMatch(q -> pidTerms.contains(q.getClassid()))); - Publication p_defaults = CleaningFunctions.cleanup(p_out); - assertEquals("CLOSED", p_defaults.getBestaccessright().getClassid()); + List poi = p_out.getInstance(); + assertNotNull(poi); + assertEquals(1, 
poi.size());
+
+		final Instance poii = poi.get(0);
+		assertNotNull(poii);
+		assertNotNull(poii.getPid());
+
+		assertEquals(2, poii.getPid().size());
+
+		assertTrue(
+			poii.getPid().stream().filter(s -> s.getValue().equals("10.1007/s109090161569x")).findFirst().isPresent());
+		assertTrue(poii.getPid().stream().filter(s -> s.getValue().equals("10.1008/abcd")).findFirst().isPresent());
+
+		assertNotNull(poii.getAlternateIdentifier());
+		assertEquals(2, poii.getAlternateIdentifier().size());
+
+		assertTrue(
+			poii
+				.getAlternateIdentifier()
+				.stream()
+				.filter(s -> s.getValue().equals("10.1007/s109090161569x"))
+				.findFirst()
+				.isPresent());
+		assertTrue(
+			poii
+				.getAlternateIdentifier()
+				.stream()
+				.filter(s -> s.getValue().equals("10.1009/qwerty"))
+				.findFirst()
+				.isPresent());
+
+		Publication p_cleaned = CleaningFunctions.cleanup(p_out);
+		assertEquals("CLOSED", p_cleaned.getBestaccessright().getClassid());
 		assertNull(p_out.getPublisher());
-		getAuthorPids(p_defaults).forEach(pid -> {
+		final List<Instance> pci = p_cleaned.getInstance();
+		assertNotNull(pci);
+		assertEquals(1, pci.size());
+
+		final Instance pcii = pci.get(0);
+		assertNotNull(pcii);
+		assertNotNull(pcii.getPid());
+
+		assertEquals(2, pcii.getPid().size());
+
+		assertTrue(
+			pcii.getPid().stream().filter(s -> s.getValue().equals("10.1007/s109090161569x")).findFirst().isPresent());
+		assertTrue(pcii.getPid().stream().filter(s -> s.getValue().equals("10.1008/abcd")).findFirst().isPresent());
+
+		assertNotNull(pcii.getAlternateIdentifier());
+		assertEquals(1, pcii.getAlternateIdentifier().size());
+		assertTrue(
+			pcii
+				.getAlternateIdentifier()
+				.stream()
+				.filter(s -> s.getValue().equals("10.1009/qwerty"))
+				.findFirst()
+				.isPresent());
+
+		getAuthorPids(p_cleaned).forEach(pid -> {
 			System.out
 				.println(
 					String
@@ -100,7 +157,7 @@
 		});
 		// TODO add more assertions to verify the cleaned values
-		System.out.println(MAPPER.writeValueAsString(p_out));
+		System.out.println(MAPPER.writeValueAsString(p_cleaned));
 		/*
 		 * assertTrue( p_out .getPid() .stream() .allMatch(sp -> StringUtils.isNotBlank(sp.getValue())));
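The assertions above pin down the cleaning invariant for instance-level identifiers: after CleaningFunctions.cleanup the pids are untouched, while any alternate identifier whose value duplicates one of the pids (here 10.1007/s109090161569x) is dropped, leaving only 10.1009/qwerty. A Scala sketch of that deduplication rule; the case classes and the function name are illustrative, not the project's Java implementation:

// Alternate identifiers that repeat one of the instance pids are redundant.
case class Pid(qualifier: String, value: String)
case class Inst(pid: List[Pid], alternateIdentifier: List[Pid])

def dropRedundantAlternateIds(i: Inst): Inst = {
  // Key on (scheme, value), so the same value under a different scheme survives.
  val pidKeys = i.pid.map(p => (p.qualifier, p.value)).toSet
  i.copy(alternateIdentifier =
    i.alternateIdentifier.filterNot(a => pidKeys((a.qualifier, a.value))))
}

// Mirrors the fixture above: two pids, one duplicated alternate identifier.
val cleaned = dropRedundantAlternateIds(Inst(
  pid = List(Pid("doi", "10.1007/s109090161569x"), Pid("doi", "10.1008/abcd")),
  alternateIdentifier = List(Pid("doi", "10.1007/s109090161569x"), Pid("doi", "10.1009/qwerty"))))
// cleaned.alternateIdentifier == List(Pid("doi", "10.1009/qwerty"))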
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java
index 4c1b0a739..2e46f0f50 100644
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java
@@ -15,8 +15,8 @@ import org.junit.jupiter.api.extension.ExtendWith;
 import org.mockito.Mock;
 import org.mockito.junit.jupiter.MockitoExtension;
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
 import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest;
-import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
index 7500afd5a..c86e31280 100644
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
@@ -21,8 +21,8 @@ import org.mockito.junit.jupiter.MockitoExtension;
 import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
 import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest;
-import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.Author;
 import eu.dnetlib.dhp.schema.oaf.Dataset;
@@ -32,6 +32,7 @@ import eu.dnetlib.dhp.schema.oaf.Publication;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.schema.oaf.Software;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.dhp.schema.oaf.utils.PidType;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 @ExtendWith(MockitoExtension.class)
@@ -396,11 +397,10 @@
 		assertEquals(1, d.getAuthor().size());
 		assertEquals(1, d.getSubject().size());
 		assertEquals(1, d.getInstance().size());
-		assertTrue(d.getPid().isEmpty());
-
-		assertTrue(d.getInstance().get(0).getPid().isEmpty());
-		assertEquals(1, d.getInstance().get(0).getAlternateIdentifier().size());
-		assertEquals("handle", d.getInstance().get(0).getAlternateIdentifier().get(0).getQualifier().getClassid());
+		assertNotNull(d.getPid());
+		assertEquals(1, d.getPid().size());
+		assertTrue(PidType.isValid(d.getPid().get(0).getQualifier().getClassid()));
+		assertEquals(PidType.handle, PidType.valueOf(d.getPid().get(0).getQualifier().getClassid()));
 		assertNotNull(d.getInstance().get(0).getUrl());
 	}
@@ -451,7 +451,10 @@
 		assertEquals(1, p.getAuthor().size());
 		assertEquals("OPEN", p.getBestaccessright().getClassid());
-		assertTrue(p.getPid().isEmpty());
+		assertTrue(p.getPid().size() == 1);
+		assertTrue(PidType.isValid(p.getPid().get(0).getQualifier().getClassid()));
+		assertTrue(PidType.handle.equals(PidType.valueOf(p.getPid().get(0).getQualifier().getClassid())));
+		assertEquals("hdl:11858/00-1734-0000-0003-EE73-2", p.getPid().get(0).getValue());
 		assertEquals("dataset", p.getResulttype().getClassname());
 		assertEquals(1, p.getInstance().size());
 		assertEquals("OPEN", p.getInstance().get(0).getAccessright().getClassid());
@@ -461,11 +464,8 @@
 			"http://creativecommons.org/licenses/by/3.0/de/legalcode", p.getInstance().get(0).getLicense().getValue());
 		assertEquals(1, p.getInstance().size());
-		assertEquals(1, p.getInstance().get(0).getAlternateIdentifier().size());
-		assertEquals("handle", p.getInstance().get(0).getAlternateIdentifier().get(0).getQualifier().getClassid());
-		assertEquals(
-			"hdl:11858/00-1734-0000-0003-EE73-2", p.getInstance().get(0).getAlternateIdentifier().get(0).getValue());
-
+		assertNotNull(p.getInstance().get(0).getAlternateIdentifier());
+		assertEquals(0, p.getInstance().get(0).getAlternateIdentifier().size());
 		assertEquals(1, p.getInstance().get(0).getUrl().size());
 	}
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java
index 0d1ec1ad1..b38da4569 100644
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java
@@ -27,7 +27,7 @@
 import org.mockito.junit.jupiter.MockitoExtension;
 import com.fasterxml.jackson.core.type.TypeReference;
 import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.*; @ExtendWith(MockitoExtension.class) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index e746d236e..23de2ef86 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -318,6 +318,50 @@ "id": "50|CSC_________::2250a70c903c6ac6e4c01438259e9375", "instance": [ { + "pid": [ + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1007/s109090161569x" + }, + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1008/abcd" + } + ], + "alternateIdentifier": [ + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1007/s109090161569x" + }, + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1009/qwerty" + } + ], "accessright": { "classid": "CLOSED", "classname": "CLOSED", diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala index 607c4fa45..cad8d88d3 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala @@ -224,7 +224,7 @@ object DLIToOAF { if (cleanedPids.isEmpty) return null result.setId(generateId(inputPublication.getId)) - result.setDataInfo(generateDataInfo(invisibile = true)) + result.setDataInfo(generateDataInfo(invisible = true)) if (inputPublication.getCollectedfrom == null || inputPublication.getCollectedfrom.size() == 0 || (inputPublication.getCollectedfrom.size() == 1 && inputPublication.getCollectedfrom.get(0) == null)) return null result.setCollectedfrom(inputPublication.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).filter(p => p != null).asJava) diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml index f92b63dfd..58b37ad0e 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml @@ -80,7 +80,6 @@ - diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh deleted file mode 100644 index 70112dc7b..000000000 --- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh +++ /dev/null @@ -1,18 +0,0 @@ -export 
PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs -export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) -if ! [ -L $link_folder ] -then - rm -Rf "$link_folder" - ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} -fi - -echo "Getting file from " $3 -hdfs dfs -copyToLocal $3 - -echo "Running impala shell make the new database visible" -impala-shell -q "INVALIDATE METADATA;" - -echo "Running impala shell to compute new table stats" -impala-shell -d $1 -f $2 -echo "Impala shell finished" -rm $2 diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql deleted file mode 100644 index 34e48a18a..000000000 --- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql +++ /dev/null @@ -1,8 +0,0 @@ ------------------------------------------------------- ------------------------------------------------------- --- Impala table statistics - Needed to make the tables --- visible for impala ------------------------------------------------------- ------------------------------------------------------- - -INVALIDATE METADATA ${stats_db_name}; diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql deleted file mode 100644 index 48f8d58fd..000000000 --- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql +++ /dev/null @@ -1,207 +0,0 @@ ------------------------------------------------------- ------------------------------------------------------- --- Shadow schema table exchange ------------------------------------------------------- ------------------------------------------------------- - --- Dropping old views -DROP VIEW IF EXISTS ${stats_db_production_name}.category; -DROP VIEW IF EXISTS ${stats_db_production_name}.concept; -DROP VIEW IF EXISTS ${stats_db_production_name}.context; -DROP VIEW IF EXISTS ${stats_db_production_name}.country; -DROP VIEW IF EXISTS ${stats_db_production_name}.countrygdp; -DROP VIEW IF EXISTS ${stats_db_production_name}.creation_date; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_citations; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_classifications; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_concepts; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_datasources; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_languages; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_licenses; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_oids; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_pids; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_refereed; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_sources; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_topics; -DROP VIEW IF EXISTS ${stats_db_production_name}.datasource; -DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_languages; -DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_oids; -DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_organizations; -DROP VIEW IF EXISTS 
${stats_db_production_name}.datasource_results; -DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_sources; -DROP VIEW IF EXISTS ${stats_db_production_name}.funder; -DROP VIEW IF EXISTS ${stats_db_production_name}.fundref; -DROP VIEW IF EXISTS ${stats_db_production_name}.numbers_country; -DROP VIEW IF EXISTS ${stats_db_production_name}.organization; -DROP VIEW IF EXISTS ${stats_db_production_name}.organization_datasources; -DROP VIEW IF EXISTS ${stats_db_production_name}.organization_pids; -DROP VIEW IF EXISTS ${stats_db_production_name}.organization_projects; -DROP VIEW IF EXISTS ${stats_db_production_name}.organization_sources; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_citations; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_classifications; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_concepts; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_datasources; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_languages; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_licenses; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_oids; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_pids; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_refereed; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_sources; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_topics; -DROP VIEW IF EXISTS ${stats_db_production_name}.project; -DROP VIEW IF EXISTS ${stats_db_production_name}.project_oids; -DROP VIEW IF EXISTS ${stats_db_production_name}.project_organizations; -DROP VIEW IF EXISTS ${stats_db_production_name}.project_results; -DROP VIEW IF EXISTS ${stats_db_production_name}.project_resultcount; -DROP VIEW IF EXISTS ${stats_db_production_name}.project_results_publication; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication_citations; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication_classifications; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication_concepts; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication_datasources; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication_languages; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication_licenses; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication_oids; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication_pids; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication_refereed; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication_sources; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication_topics; -DROP VIEW IF EXISTS ${stats_db_production_name}.result; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_affiliated_country; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_citations; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_classifications; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_concepts; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_datasources; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_deposited_country; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_fundercount; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_gold; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_greenoa; -DROP VIEW IF EXISTS 
${stats_db_production_name}.result_languages; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_licenses; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_oids; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_organization; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_peerreviewed; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_pids; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_projectcount; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_projects; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_refereed; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_sources; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_topics; -DROP VIEW IF EXISTS ${stats_db_production_name}.rndexpediture; -DROP VIEW IF EXISTS ${stats_db_production_name}.roarmap; -DROP VIEW IF EXISTS ${stats_db_production_name}.software; -DROP VIEW IF EXISTS ${stats_db_production_name}.software_citations; -DROP VIEW IF EXISTS ${stats_db_production_name}.software_classifications; -DROP VIEW IF EXISTS ${stats_db_production_name}.software_concepts; -DROP VIEW IF EXISTS ${stats_db_production_name}.software_datasources; -DROP VIEW IF EXISTS ${stats_db_production_name}.software_languages; -DROP VIEW IF EXISTS ${stats_db_production_name}.software_licenses; -DROP VIEW IF EXISTS ${stats_db_production_name}.software_oids; -DROP VIEW IF EXISTS ${stats_db_production_name}.software_pids; -DROP VIEW IF EXISTS ${stats_db_production_name}.software_refereed; -DROP VIEW IF EXISTS ${stats_db_production_name}.software_sources; -DROP VIEW IF EXISTS ${stats_db_production_name}.software_topics; - - --- Creating the shadow database, in case it doesn't exist -CREATE database IF NOT EXISTS ${stats_db_production_name}; - --- Creating new views -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.category AS SELECT * FROM ${stats_db_name}.category; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.concept AS SELECT * FROM ${stats_db_name}.concept; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.context AS SELECT * FROM ${stats_db_name}.context; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.country AS SELECT * FROM ${stats_db_name}.country; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.countrygdp AS SELECT * FROM ${stats_db_name}.countrygdp; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.creation_date AS SELECT * FROM ${stats_db_name}.creation_date; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset AS SELECT * FROM ${stats_db_name}.dataset; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_citations AS SELECT * FROM ${stats_db_name}.dataset_citations; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_classifications AS SELECT * FROM ${stats_db_name}.dataset_classifications; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_concepts AS SELECT * FROM ${stats_db_name}.dataset_concepts; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_datasources AS SELECT * FROM ${stats_db_name}.dataset_datasources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_languages AS SELECT * FROM ${stats_db_name}.dataset_languages; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_licenses AS SELECT * FROM ${stats_db_name}.dataset_licenses; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_oids AS SELECT * FROM ${stats_db_name}.dataset_oids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_pids AS SELECT * FROM ${stats_db_name}.dataset_pids; -CREATE 
VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_refereed AS SELECT * FROM ${stats_db_name}.dataset_refereed; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_sources AS SELECT * FROM ${stats_db_name}.dataset_sources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_topics AS SELECT * FROM ${stats_db_name}.dataset_topics; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource AS SELECT * FROM ${stats_db_name}.datasource; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_languages AS SELECT * FROM ${stats_db_name}.datasource_languages; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_oids AS SELECT * FROM ${stats_db_name}.datasource_oids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_organizations AS SELECT * FROM ${stats_db_name}.datasource_organizations; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_results AS SELECT * FROM ${stats_db_name}.datasource_results; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_sources AS SELECT * FROM ${stats_db_name}.datasource_sources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.funder AS SELECT * FROM ${stats_db_name}.funder; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.fundref AS SELECT * FROM ${stats_db_name}.fundref; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.numbers_country AS SELECT * FROM ${stats_db_name}.numbers_country; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization AS SELECT * FROM ${stats_db_name}.organization; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_datasources AS SELECT * FROM ${stats_db_name}.organization_datasources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_pids AS SELECT * FROM ${stats_db_name}.organization_pids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_projects AS SELECT * FROM ${stats_db_name}.organization_projects; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_sources AS SELECT * FROM ${stats_db_name}.organization_sources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct AS SELECT * FROM ${stats_db_name}.otherresearchproduct; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_citations AS SELECT * FROM ${stats_db_name}.otherresearchproduct_citations; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_classifications AS SELECT * FROM ${stats_db_name}.otherresearchproduct_classifications; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_concepts AS SELECT * FROM ${stats_db_name}.otherresearchproduct_concepts; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_datasources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_datasources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_languages AS SELECT * FROM ${stats_db_name}.otherresearchproduct_languages; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_licenses AS SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_oids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_oids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_pids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_pids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_refereed AS SELECT * FROM 
${stats_db_name}.otherresearchproduct_refereed; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_sources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_topics AS SELECT * FROM ${stats_db_name}.otherresearchproduct_topics; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project AS SELECT * FROM ${stats_db_name}.project; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_oids AS SELECT * FROM ${stats_db_name}.project_oids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_organizations AS SELECT * FROM ${stats_db_name}.project_organizations; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_results AS SELECT * FROM ${stats_db_name}.project_results; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_resultcount AS SELECT * FROM ${stats_db_name}.project_resultcount; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_results_publication AS SELECT * FROM ${stats_db_name}.project_results_publication; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication AS SELECT * FROM ${stats_db_name}.publication; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_citations AS SELECT * FROM ${stats_db_name}.publication_citations; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_classifications AS SELECT * FROM ${stats_db_name}.publication_classifications; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_concepts AS SELECT * FROM ${stats_db_name}.publication_concepts; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_datasources AS SELECT * FROM ${stats_db_name}.publication_datasources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_languages AS SELECT * FROM ${stats_db_name}.publication_languages; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_licenses AS SELECT * FROM ${stats_db_name}.publication_licenses; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_oids AS SELECT * FROM ${stats_db_name}.publication_oids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_pids AS SELECT * FROM ${stats_db_name}.publication_pids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_refereed AS SELECT * FROM ${stats_db_name}.publication_refereed; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_sources AS SELECT * FROM ${stats_db_name}.publication_sources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_topics AS SELECT * FROM ${stats_db_name}.publication_topics; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result AS SELECT * FROM ${stats_db_name}.result; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_affiliated_country AS SELECT * FROM ${stats_db_name}.result_affiliated_country; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_citations AS SELECT * FROM ${stats_db_name}.result_citations; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_classifications AS SELECT * FROM ${stats_db_name}.result_classifications; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_concepts AS SELECT * FROM ${stats_db_name}.result_concepts; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_datasources AS SELECT * FROM ${stats_db_name}.result_datasources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_deposited_country AS SELECT * FROM 
${stats_db_name}.result_deposited_country; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_fundercount AS SELECT * FROM ${stats_db_name}.result_fundercount; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_gold AS SELECT * FROM ${stats_db_name}.result_gold; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_greenoa AS SELECT * FROM ${stats_db_name}.result_greenoa; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_languages AS SELECT * FROM ${stats_db_name}.result_languages; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_licenses AS SELECT * FROM ${stats_db_name}.result_licenses; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_oids AS SELECT * FROM ${stats_db_name}.result_oids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_organization AS SELECT * FROM ${stats_db_name}.result_organization; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_peerreviewed AS SELECT * FROM ${stats_db_name}.result_peerreviewed; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_pids AS SELECT * FROM ${stats_db_name}.result_pids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_projectcount AS SELECT * FROM ${stats_db_name}.result_projectcount; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_projects AS SELECT * FROM ${stats_db_name}.result_projects; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_refereed AS SELECT * FROM ${stats_db_name}.result_refereed; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_sources AS SELECT * FROM ${stats_db_name}.result_sources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_topics AS SELECT * FROM ${stats_db_name}.result_topics; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.rndexpediture AS SELECT * FROM ${stats_db_name}.rndexpediture; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.roarmap AS SELECT * FROM ${stats_db_name}.roarmap; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software AS SELECT * FROM ${stats_db_name}.software; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_citations AS SELECT * FROM ${stats_db_name}.software_citations; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_classifications AS SELECT * FROM ${stats_db_name}.software_classifications; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_concepts AS SELECT * FROM ${stats_db_name}.software_concepts; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_datasources AS SELECT * FROM ${stats_db_name}.software_datasources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_languages AS SELECT * FROM ${stats_db_name}.software_languages; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_licenses AS SELECT * FROM ${stats_db_name}.software_licenses; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_oids AS SELECT * FROM ${stats_db_name}.software_oids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_pids AS SELECT * FROM ${stats_db_name}.software_pids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_refereed AS SELECT * FROM ${stats_db_name}.software_refereed; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_sources AS SELECT * FROM ${stats_db_name}.software_sources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_topics AS SELECT * FROM ${stats_db_name}.software_topics; diff --git 
a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh new file mode 100644 index 000000000..3e510e87e --- /dev/null +++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh @@ -0,0 +1,16 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export PRODUCTION=$2 + +echo "Updating ${PRODUCTION} database" +impala-shell -q "create database if not exists ${PRODUCTION}" +impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f - +impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f - +echo "Production db ready!" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index d744f18da..0d8ff7ee3 100644 --- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -6,7 +6,15 @@
 stats_db_production_name
- the name of the production schema
+ the name of the public production schema
+
+
+ monitor_db_name
+ the monitor database name
+
+
+ monitor_db_production_name
+ the name of the monitor public database
 stats_tool_api_url
@@ -48,25 +56,26 @@
-
- ${hive_jdbc_url}
-
- stats_db_name=${stats_db_name}
- stats_db_production_name=${stats_db_production_name}
-
-
-
-
-
-
 ${jobTracker}
 ${nameNode}
- impala-shell.sh
+ updateProductionViews.sh
+ ${stats_db_name}
 ${stats_db_production_name}
- computeProductionStats.sql
- ${wf:appPath()}/scripts/computeProductionStats.sql
- impala-shell.sh
+ updateProductionViews.sh
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ updateProductionViews.sh
+ ${monitor_db_name}
+ ${monitor_db_production_name}
+ updateProductionViews.sh
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
new file mode 100644
index 000000000..29b225e3c
--- /dev/null
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
@@ -0,0 +1,43 @@
+export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
+export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
+if ! [ -L $link_folder ]
+then
+    rm -Rf "$link_folder"
+    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
+fi
+
+CONTEXT_API=$1
+TARGET_DB=$2
+
+TMP=/tmp/stats-update-`tr -dc A-Za-z0-9 </dev/urandom | head -c 6`
+
+[curl call that downloads the context list from ${CONTEXT_API}; line lost in extraction] > contexts.csv
+cat contexts.csv | cut -d , -f1 | xargs -I {} curl -L ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
+cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g' | xargs -I {} curl -L ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
+cat contexts.csv | sed 's/^\(.*\),\(.*\)/\1,\1::other,\2/' >> categories.csv
+cat categories.csv | grep -v ::other | sed 's/^.*,\(.*\),\(.*\)/\1,\1::other,\2/' >> concepts.csv
+
+echo "uploading context data to hdfs"
+hdfs dfs -mkdir ${TMP}
+hdfs dfs -copyFromLocal contexts.csv ${TMP}
+hdfs dfs -copyFromLocal categories.csv ${TMP}
+hdfs dfs -copyFromLocal concepts.csv ${TMP}
+hdfs dfs -chmod -R 777 ${TMP}
+
+echo "Creating and populating impala tables"
+impala-shell -q "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','"
+impala-shell -q "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','"
+impala-shell -q "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','"
+impala-shell -d ${TARGET_DB} -q "invalidate metadata"
+impala-shell -q "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context"
+impala-shell -q "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category"
+impala-shell -q "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept"
+
+echo "Cleaning up"
+hdfs dfs -rm -f -r -skipTrash ${TMP}
+rm concepts.csv
+rm categories.csv
+rm contexts.csv
+
+echo "Finito!"
\ No newline at end of file
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql
index 6c96317e6..77fbd3b18 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql
@@ -23,19 +23,6 @@ CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS
 SELECT *
 FROM ${external_stats_db_name}.rndexpediture;
-CREATE OR REPLACE VIEW ${stats_db_name}.context AS
-SELECT *
-FROM ${external_stats_db_name}.context;
-
-CREATE OR REPLACE VIEW ${stats_db_name}.category AS
-SELECT *
-FROM ${external_stats_db_name}.category;
-
-CREATE OR REPLACE VIEW ${stats_db_name}.concept AS
-SELECT *
-FROM ${external_stats_db_name}.concept;
-
-
 ------------------------------------------------------------------------------------------------
 ------------------------------------------------------------------------------------------------
 -- Creation date of the database
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
index 62a158560..75b24b189 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
@@ -47,7 +47,10 @@ from ${openaire_db_name}.publication p
 where p.datainfo.deletedbyinference = false;
 CREATE TABLE ${stats_db_name}.publication_concepts AS
-SELECT substr(p.id, 4) as id, contexts.context.id as concept
+SELECT substr(p.id, 4) as id, case
+    when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
+    when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
+    when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
 from ${openaire_db_name}.publication p
 LATERAL VIEW explode(p.context) contexts as context
 where p.datainfo.deletedbyinference = false;
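The CASE expression above (repeated in steps 3 to 5) pads every context id on the fact side to exactly three ::-separated levels, while contexts.sh appends the matching ::other rows to categories.csv and concepts.csv on the dimension side, so the join between the *_concepts tables and the concept table always resolves. The padding rule, sketched in Scala with hypothetical example ids (illustrative, not project code):

// Pad a context id to the three-level form context::category::concept,
// mirroring the SQL CASE: one missing level appends "::other",
// two missing levels append "::other::other".
def normalizeConcept(id: String): String =
  id.split("::", -1).length match {
    case 1 => s"$id::other::other" // bare context id
    case 2 => s"$id::other"        // context::category
    case _ => id                   // already fully qualified
  }

assert(normalizeConcept("dh-ch") == "dh-ch::other::other")
assert(normalizeConcept("dh-ch::subcommunity") == "dh-ch::subcommunity::other")
assert(normalizeConcept("dh-ch::subcommunity::36") == "dh-ch::subcommunity::36")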
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
index fd5390e66..54345e074 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
@@ -54,7 +54,10 @@ FROM ${openaire_db_name}.software p
 where p.datainfo.deletedbyinference = false;
 
 CREATE TABLE ${stats_db_name}.software_concepts AS
-SELECT substr(p.id, 4) AS id, contexts.context.id AS concept
+SELECT substr(p.id, 4) as id, case
+    when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
+    when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
+    when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
 FROM ${openaire_db_name}.software p
     LATERAL VIEW explode(p.context) contexts AS context
 where p.datainfo.deletedbyinference = false;
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
index b359b596f..36ad5d92a 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
@@ -52,7 +52,10 @@ FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.
 where p.datainfo.deletedbyinference = false;
 
 CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS
-SELECT substr(p.id, 4) AS id, contexts.context.id AS concept
+SELECT substr(p.id, 4) as id, case
+    when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
+    when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
+    when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
 FROM ${openaire_db_name}.otherresearchproduct p
     LATERAL VIEW explode(p.context) contexts AS context
 where p.datainfo.deletedbyinference = false;
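Note: step4.sql and step5.sql apply the same three-level padding to software and other research products. The synthetic ::other categories and concepts that those padded ids resolve to are produced by the two sed append lines in contexts.sh; a quick check of the first rewrite on a made-up contexts.csv row shows the shape of the generated rows:

    echo 'dh-ch,Digital Humanities' | sed 's/^\(.*\),\(.*\)/\1,\1::other,\2/'
    # prints: dh-ch,dh-ch::other,Digital Humanities
    # i.e. parent context id, synthetic ::other category id, original label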
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
index 9c16f149d..321500e2c 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
@@ -41,6 +41,10 @@
         hive_timeout
         the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a hearbeat. The default value is 300 seconds.
+
+        context_api_url
+        the base url of the context api (https://services.openaire.eu/openaire)
+
@@ -260,6 +264,19 @@
             stats_db_name=${stats_db_name}
             openaire_db_name=${openaire_db_name}
+
+
+
+
+
+
+            ${jobTracker}
+            ${nameNode}
+            contexts.sh
+            ${context_api_url}
+            ${stats_db_name}
+            contexts.sh
+
diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml
index 190c9847e..ec8f9268c 100644
--- a/dhp-workflows/pom.xml
+++ b/dhp-workflows/pom.xml
@@ -28,6 +28,9 @@
         dhp-graph-provision-scholexplorer
         dhp-blacklist
         dhp-stats-update
+        dhp-stats-promote
+        dhp-usage-stats-build
+        dhp-usage-raw-data-update
         dhp-broker-events
         dhp-doiboost
diff --git a/pom.xml b/pom.xml
index d9217c0af..fdd07c97a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -359,7 +359,7 @@
             ${dnet.openaire.broker.common}
 
-
+
             org.apache.cxf
             cxf-rt-transports-http
             3.1.5
@@ -371,11 +371,6 @@
             provided
 
-
-            com.rabbitmq
-            amqp-client
-            5.6.0
-
         com.jayway.jsonpath
         json-path
@@ -408,20 +403,20 @@
             4.0
 
-
-            com.ximpleware
-            vtd-xml
-            ${vtd.version}
-
+
+            com.ximpleware
+            vtd-xml
+            ${vtd.version}
+
-
-            org.elasticsearch
-            elasticsearch-hadoop
-            7.6.0
-
+
+            org.elasticsearch
+            elasticsearch-hadoop
+            7.6.0
+
-
+
             org.apache.oozie
             oozie-client
             ${dhp.oozie.version}
@@ -463,6 +458,12 @@
             ${apache.poi.version}
 
+
+            org.json
+            json
+            20180813
+
+
         org.json4s
         json4s-jackson_2.11
@@ -687,6 +688,8 @@
         UTF-8
         UTF-8
         3.6.0
+        1.8
+        1.8
         2.22.2
         2.0.1
         cdh5.9.2
@@ -713,4 +716,4 @@
         4.5.3
         4.0.1
-
+
\ No newline at end of file
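Note: with dhp-stats-promote, dhp-usage-stats-build and dhp-usage-raw-data-update added to the dhp-workflows module list, the stats workflows can be built without the rest of the reactor; one plausible invocation (standard Maven -pl/-am flags, exact module selection up to the reader) is:

    # build the stats update/promote modules plus the modules they depend on
    mvn -pl dhp-workflows/dhp-stats-update,dhp-workflows/dhp-stats-promote -am clean package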