diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java index 61bc3fbcaf..4c658e52ff 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java @@ -114,10 +114,10 @@ public class CreateActionSetSparkJob implements Serializable { if (!citing.equals(cited)) { relationList - .addAll( - getRelations( + .add( + getRelation( citing, - cited)); + cited, ModelConstants.CITES)); if (duplicate && value.getCiting().endsWith(".refs")) { citing = ID_PREFIX + IdentifierFactory @@ -125,7 +125,7 @@ public class CreateActionSetSparkJob implements Serializable { CleaningFunctions .normalizePidValue( "doi", value.getCiting().substring(0, value.getCiting().indexOf(".refs")))); - relationList.addAll(getRelations(citing, cited)); + relationList.add(getRelation(citing, cited, ModelConstants.CITES)); } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateOpenCitationsASTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateOpenCitationsASTest.java index 3e4ce750eb..523437950a 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateOpenCitationsASTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateOpenCitationsASTest.java @@ -99,7 +99,7 @@ public class CreateOpenCitationsASTest { .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(aa -> ((Relation) aa.getPayload())); - assertEquals(62, tmp.count()); + assertEquals(31, tmp.count()); // tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r))); @@ -131,7 +131,7 @@ public class CreateOpenCitationsASTest { .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(aa -> ((Relation) aa.getPayload())); - assertEquals(46, tmp.count()); + assertEquals(23, tmp.count()); // tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r))); @@ -241,7 +241,7 @@ public class CreateOpenCitationsASTest { assertEquals("resultResult", r.getRelType()); }); assertEquals(23, tmp.filter(r -> r.getRelClass().equals("Cites")).count()); - assertEquals(23, tmp.filter(r -> r.getRelClass().equals("IsCitedBy")).count()); + assertEquals(0, tmp.filter(r -> r.getRelClass().equals("IsCitedBy")).count()); } @@ -318,15 +318,15 @@ public class CreateOpenCitationsASTest { JavaRDD check = tmp.filter(r -> r.getSource().equals(doi1) || r.getTarget().equals(doi1)); - assertEquals(10, check.count()); + assertEquals(5, check.count()); - check.foreach(r -> { - if (r.getSource().equals(doi2) || r.getSource().equals(doi3) || r.getSource().equals(doi4) || - r.getSource().equals(doi5) || r.getSource().equals(doi6)) { - assertEquals(ModelConstants.IS_CITED_BY, r.getRelClass()); - assertEquals(doi1, r.getTarget()); - } - }); +// check.foreach(r -> { +// if (r.getSource().equals(doi2) || r.getSource().equals(doi3) || r.getSource().equals(doi4) || +// r.getSource().equals(doi5) || r.getSource().equals(doi6)) { +// assertEquals(ModelConstants.IS_CITED_BY, r.getRelClass()); +// assertEquals(doi1, r.getTarget()); +// } +// }); assertEquals(5, check.filter(r -> r.getSource().equals(doi1)).count()); check.filter(r -> r.getSource().equals(doi1)).foreach(r -> assertEquals(ModelConstants.CITES, r.getRelClass())); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java index 19b9859648..0fc8cb3907 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java @@ -72,33 +72,28 @@ public class PrepareResultCommunitySet { String outputPath, OrganizationMap organizationMap) { - Dataset relationAffiliation = readPath(spark, inputPath, Relation.class) - .filter( - (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && - r.getRelClass().equalsIgnoreCase(ModelConstants.HAS_AUTHOR_INSTITUTION)); + Dataset relation = readPath(spark, inputPath, Relation.class); + relation.createOrReplaceTempView("relation"); - Dataset relationOrganization = readPath(spark, inputPath, Relation.class) - .filter( - (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && - r.getRelClass().equalsIgnoreCase(ModelConstants.MERGES)); + String query = "SELECT result_organization.source resultId, result_organization.target orgId, org_set merges " + + "FROM (SELECT source, target " + + " FROM relation " + + " WHERE datainfo.deletedbyinference = false " + + " AND lower(relClass) = '" + + ModelConstants.HAS_AUTHOR_INSTITUTION.toLowerCase() + + "') result_organization " + + "LEFT JOIN (SELECT source, collect_set(target) org_set " + + " FROM relation " + + " WHERE datainfo.deletedbyinference = false " + + " AND lower(relClass) = '" + + ModelConstants.MERGES.toLowerCase() + + "' " + + " GROUP BY source) organization_organization " + + "ON result_organization.target = organization_organization.source "; - Dataset result_organizationset = relationAffiliation - .joinWith( - relationOrganization, - relationAffiliation.col("target").equalTo(relationOrganization.col("source")), - "left") - .groupByKey((MapFunction, String>) t2 -> t2._2().getSource(), Encoders.STRING()) - .mapGroups((MapGroupsFunction, ResultOrganizations>) (k, it) -> { - ResultOrganizations rOrgs = new ResultOrganizations(); - rOrgs.setOrgId(k); - Tuple2 first = it.next(); - rOrgs.setResultId(first._1().getSource()); - ArrayList merges = new ArrayList<>(); - merges.add(first._2().getTarget()); - it.forEachRemaining(t -> merges.add(t._2().getTarget())); - rOrgs.setMerges(merges); - return rOrgs; - }, Encoders.bean(ResultOrganizations.class)); + Dataset result_organizationset = spark + .sql(query) + .as(Encoders.bean(ResultOrganizations.class)); result_organizationset .map(mapResultCommunityFn(organizationMap), Encoders.bean(ResultCommunityList.class)) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/GetDatasourceFromCountry.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/GetDatasourceFromCountry.java index a69b1a8bf5..85e4461211 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/GetDatasourceFromCountry.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/GetDatasourceFromCountry.java @@ -78,6 +78,8 @@ public class GetDatasourceFromCountry implements Serializable { Encoders.bean(Organization.class)) .filter( (FilterFunction) o -> !o.getDataInfo().getDeletedbyinference() && + o.getCountry() != null && + o.getCountry().getClassid() != null && o.getCountry().getClassid().length() > 0 && o.getCountry().getClassid().equals(country)); diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh index b66ab47e03..a436d0380b 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh @@ -9,6 +9,8 @@ fi CONTEXT_API=$1 TARGET_DB=$2 +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=4831838208 -hiveconf spark.yarn.executor.memoryOverhead=450" + TMP=/tmp/stats-update-`tr -dc A-Za-z0-9 foo +hive -f foo +echo "Updated shadow database" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh index 93faa43d63..2f1eefa0c3 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh @@ -7,13 +7,17 @@ then fi export TARGET=$1 -export SCRIPT_PATH=$2 +export STATS_EXT=$2 +export SCRIPT_PATH=$3 + +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228 -hiveconf hive.auto.convert.join=false" +export HADOOP_USER_NAME="oozie" echo "Getting file from " $SCRIPT_PATH hdfs dfs -copyToLocal $SCRIPT_PATH echo "Creating indicators" -impala-shell -q "invalidate metadata" -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -c -f - -cat step16-createIndicatorsTables.sql | impala-shell -d $TARGET -f - -echo "Indicators created" \ No newline at end of file +hive $HIVE_OPTS --database ${TARGET} -e "show tables" | grep -v WARN | sed "s/STATS_EXT/${STATS_EXT}/g" |sed "s/^\(.*\)/analyze table ${TARGET}.\1 compute statistics;/" > foo +hive $HIVE_OPTS -f foo +hive $HIVE_OPTS --database ${TARGET} -f step16-createIndicatorsTables.sql +echo "Indicators created" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor-post.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor-post.sh new file mode 100644 index 0000000000..5863625a1b --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor-post.sh @@ -0,0 +1,19 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export SHADOW=$2 +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" + +echo "Updating shadow database" +hive -e "drop database if exists ${SHADOW} cascade" +hive -e "create database if not exists ${SHADOW}" +hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo +hive -f foo +echo "Updated shadow database" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh index c5bda6d391..440aac7704 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/monitor.sh @@ -11,15 +11,15 @@ export TARGET=$2 export SHADOW=$3 export SCRIPT_PATH=$4 -echo "Getting file from " $4 -hdfs dfs -copyToLocal $4 +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" + +echo "Getting file from " $SCRIPT_PATH +hdfs dfs -copyToLocal $SCRIPT_PATH echo "Creating monitor database" -cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | impala-shell -f - -echo "Impala shell finished" +#cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo +cat step20-createMonitorDB.sql | sed "s/TARGET/${TARGET}/g" | sed "s/SOURCE/${SOURCE}/g1" > foo +hive $HIVE_OPTS -f foo +echo "Hive shell finished" -echo "Updating shadow monitor database" -impala-shell -q "create database if not exists ${SHADOW}" -impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - -echo "Shadow db ready!" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh index db8d39af2a..5863625a1b 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-post.sh @@ -7,15 +7,13 @@ then fi export SOURCE=$1 -export TARGET=$2 -export SHADOW=$3 +export SHADOW=$2 +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" -impala-shell -q "invalidate metadata;" -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -f - -echo "Impala shell finished" - -echo "Updating shadow observatory database" -impala-shell -q "create database if not exists ${SHADOW}" -impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f - -impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f - -echo "Shadow db ready!" \ No newline at end of file +echo "Updating shadow database" +hive -e "drop database if exists ${SHADOW} cascade" +hive -e "create database if not exists ${SHADOW}" +hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" > foo +hive -f foo +echo "Updated shadow database" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh index 55a308c50c..37671cce83 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh @@ -10,7 +10,11 @@ export SOURCE=$1 export TARGET=$2 export SHADOW=$3 +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" + echo "Creating observatory database" -impala-shell -q "drop database if exists ${TARGET} cascade" -impala-shell -q "create database if not exists ${TARGET}" -impala-shell -d ${SOURCE} -q "show tables" --delimited | grep -iv roar | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f - +hive -e "drop database if exists ${TARGET} cascade" +hive -e "create database if not exists ${TARGET}" +hive $HIVE_OPTS --database ${SOURCE} -e "show tables" | grep -v WARN | grep -iv roar | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" > foo +hive -f foo diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql index 753d61ca0a..beec310826 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql @@ -44,7 +44,7 @@ from ( from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view explode(inst.pid) pids as p) r join ${stats_db_name}.result res on res.id=r.id; -create table ${stats_db_name}.result_apc as +create table ${stats_db_name}.result_apc STORED AS PARQUET as select r.id, r.amount, r.currency from ( select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency @@ -52,4 +52,4 @@ from ( join ${stats_db_name}.result res on res.id=r.id where r.amount is not null; -create view ${stats_db_name}.issn_gold_oa_dataset as select * from stats_ext.issn_gold_oa_dataset; \ No newline at end of file +create view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index 1bda076295..4fd941e5da 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -1,5 +1,5 @@ -- Sprint 1 ---- -create table indi_pub_green_oa stored as parquet as +create table if not exists indi_pub_green_oa stored as parquet as select distinct p.id, coalesce(green_oa, 0) as green_oa from publication p left outer join ( @@ -12,9 +12,9 @@ from publication p or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp on p.id= tmp.id; -compute stats indi_pub_green_oa; +ANALYZE TABLE indi_pub_green_oa COMPUTE STATISTICS; -create table indi_pub_grey_lit stored as parquet as +create table if not exists indi_pub_grey_lit stored as parquet as select distinct p.id, coalesce(grey_lit, 0) as grey_lit from publication p left outer join ( @@ -25,9 +25,9 @@ from publication p not exists (select 1 from result_classifications rc where type ='Other literature type' and rc.id=p.id)) tmp on p.id=tmp.id; -compute stats indi_pub_grey_lit; +ANALYZE TABLE indi_pub_grey_lit COMPUTE STATISTICS; -create table indi_pub_doi_from_crossref stored as parquet as +create table if not exists indi_pub_doi_from_crossref stored as parquet as select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref from publication p left outer join @@ -36,10 +36,10 @@ from publication p where pidtype='Digital Object Identifier' and d.name ='Crossref') tmp on tmp.id=p.id; -compute stats indi_pub_doi_from_crossref; +ANALYZE TABLE indi_pub_doi_from_crossref COMPUTE STATISTICS; -- Sprint 2 ---- -create table indi_result_has_cc_licence stored as parquet as +create table if not exists indi_result_has_cc_licence stored as parquet as select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license from result r left outer join (select r.id, license.type as lic from result r @@ -47,9 +47,9 @@ from result r where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp on r.id= tmp.id; -compute stats indi_result_has_cc_licence; +ANALYZE TABLE indi_result_has_cc_licence COMPUTE STATISTICS; -create table indi_result_has_cc_licence_url stored as parquet as +create table if not exists indi_result_has_cc_licence_url stored as parquet as select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url from result r left outer join (select r.id, lower(parse_url(license.type, "HOST")) as lic_host @@ -58,31 +58,31 @@ from result r WHERE lower(parse_url(license.type, "HOST")) = "creativecommons.org") tmp on r.id= tmp.id; -compute stats indi_result_has_cc_licence_url; +ANALYZE TABLE indi_result_has_cc_licence_url COMPUTE STATISTICS; -create table indi_pub_has_abstract stored as parquet as -select distinct publication.id, coalesce(abstract, 1) has_abstract +create table if not exists indi_pub_has_abstract stored as parquet as +select distinct publication.id, cast(coalesce(abstract, true) as int) has_abstract from publication; -compute stats indi_pub_has_abstract; +ANALYZE TABLE indi_pub_has_abstract COMPUTE STATISTICS; -create table indi_result_with_orcid stored as parquet as +create table if not exists indi_result_with_orcid stored as parquet as select distinct r.id, coalesce(has_orcid, 0) as has_orcid from result r left outer join (select id, 1 as has_orcid from result_orcid) tmp on r.id= tmp.id; -compute stats indi_result_with_orcid; +ANALYZE TABLE indi_result_with_orcid COMPUTE STATISTICS; ---- Sprint 3 ---- -create table indi_funded_result_with_fundref stored as parquet as +create table if not exists indi_funded_result_with_fundref stored as parquet as select distinct r.result as id, coalesce(fundref, 0) as fundref from project_results r left outer join (select distinct result, 1 as fundref from project_results where provenance='Harvested') tmp on r.result= tmp.result; -compute stats indi_funded_result_with_fundref; +ANALYZE TABLE indi_funded_result_with_fundref COMPUTE STATISTICS; -- create table indi_result_org_collab stored as parquet as -- select o1.organization org1, o2.organization org2, count(distinct o1.id) as collaborations @@ -92,77 +92,59 @@ compute stats indi_funded_result_with_fundref; -- -- compute stats indi_result_org_collab; -- -create table indi_result_org_collab stored as parquet as -with tmp as ( -select distinct ro.organization organization, ro.id from result_organization ro -join organization o on o.id=ro.organization where o.name is not null) +create TEMPORARY TABLE tmp AS SELECT ro.organization organization, ro.id from result_organization ro +join organization o on o.id=ro.organization where o.name is not null; + +create table if not exists indi_result_org_collab stored as parquet as select o1.organization org1, o2.organization org2, count(o1.id) as collaborations from tmp as o1 -join tmp as o2 on o1.id=o2.id and o1.organization!=o2.organization -group by org1, org2; +join tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization +group by o1.organization, o2.organization; -compute stats indi_result_org_collab; +drop table tmp purge; --- create table indi_result_org_country_collab stored as parquet as --- with tmp as --- (select o.id as id, o.country , ro.id as result,r.type from organization o --- join result_organization ro on o.id=ro.organization --- join result r on r.id=ro.id where o.country <> 'UNKNOWN') --- select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations --- from tmp as o1 --- join tmp as o2 on o1.result=o2.result --- where o1.id<>o2.id and o1.country<>o2.country --- group by o1.id, o1.type,o2.country; --- --- compute stats indi_result_org_country_collab; --- -create table indi_result_org_country_collab stored as parquet as -with tmp as -(select distinct ro.organization organization, ro.id, o.country from result_organization ro -join organization o on o.id=ro.organization where country <> 'UNKNOWN' and o.name is not null) +ANALYZE TABLE indi_result_org_collab COMPUTE STATISTICS; + +create TEMPORARY TABLE tmp AS +select distinct ro.organization organization, ro.id, o.country from result_organization ro +join organization o on o.id=ro.organization where country <> 'UNKNOWN' and o.name is not null; + +create table if not exists indi_result_org_country_collab stored as parquet as select o1.organization org1,o2.country country2, count(o1.id) as collaborations from tmp as o1 join tmp as o2 on o1.id=o2.id where o1.id=o2.id and o1.country!=o2.country group by o1.organization, o1.id, o2.country; -compute stats indi_result_org_country_collab; +drop table tmp purge; --- create table indi_result_org_collab stored as parquet as --- with tmp as --- (select o.id, ro.id as result,r.type from organization o --- join result_organization ro on o.id=ro.organization --- join result r on r.id=ro.id) --- select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations --- from tmp as o1 --- join tmp as o2 on o1.result=o2.result --- where o1.id<>o2.id --- group by o1.id, o2.id, o1.type; --- --- compute stats indi_result_org_collab; --- -create table indi_project_collab_org stored as parquet as +ANALYZE TABLE indi_result_org_country_collab COMPUTE STATISTICS; + +create table if not exists indi_project_collab_org stored as parquet as select o1.id org1,o2.id org2, count(distinct o1.project) as collaborations from organization_projects as o1 join organization_projects as o2 on o1.project=o2.project where o1.id!=o2.id group by o1.id, o2.id; -compute stats indi_project_collab_org; +ANALYZE TABLE indi_project_collab_org COMPUTE STATISTICS; -create table indi_project_collab_org_country stored as parquet as - with tmp as - (select o.id organization, o.country , ro.project as project from organization o +create TEMPORARY TABLE tmp AS +select o.id organization, o.country , ro.project as project from organization o join organization_projects ro on o.id=ro.id - and o.country <> 'UNKNOWN') + and o.country <> 'UNKNOWN'; + +create table if not exists indi_project_collab_org_country stored as parquet as select o1.organization org1,o2.country country2, count(distinct o1.project) as collaborations from tmp as o1 join tmp as o2 on o1.project=o2.project where o1.organization<>o2.organization and o1.country<>o2.country group by o1.organization, o2.country; -compute stats indi_project_collab_org_country; +drop table tmp purge; -create table indi_funder_country_collab stored as parquet as +ANALYZE TABLE indi_project_collab_org_country COMPUTE STATISTICS; + +create table if not exists indi_funder_country_collab stored as parquet as with tmp as (select funder, project, country from organization_projects op join organization o on o.id=op.id join project p on p.id=op.project @@ -173,72 +155,50 @@ from tmp as f1 where f1.country<>f2.country group by f1.funder, f2.country, f1.country; -compute stats indi_funder_country_collab; --- --- create table indi_result_country_collab stored as parquet as --- with tmp as --- (select country, ro.id as result,r.type from organization o --- join result_organization ro on o.id=ro.organization --- join result r on r.id=ro.id where country <> 'UNKNOWN') --- select o1.country country1, o2.country country2, o1.type, count(distinct o1.result) as collaborations --- from tmp as o1 --- join tmp as o2 on o1.result=o2.result --- where o1.country<>o2.country --- group by o1.country, o2.country, o1.type; --- --- compute stats indi_result_country_collab; +ANALYZE TABLE indi_funder_country_collab COMPUTE STATISTICS; -create table indi_result_country_collab stored as parquet as -with tmp as - (select distinct country, ro.id as result from organization o +create TEMPORARY TABLE tmp AS +select distinct country, ro.id as result from organization o join result_organization ro on o.id=ro.organization - where country <> 'UNKNOWN' and o.name is not null) + where country <> 'UNKNOWN' and o.name is not null; + +create table if not exists indi_result_country_collab stored as parquet as select o1.country country1, o2.country country2, count(o1.result) as collaborations from tmp as o1 join tmp as o2 on o1.result=o2.result where o1.country<>o2.country group by o1.country, o2.country; -compute stats indi_result_country_collab; +drop table tmp purge; + +ANALYZE TABLE indi_result_country_collab COMPUTE STATISTICS; ---- Sprint 4 ---- -create table indi_pub_diamond stored as parquet as +create table if not exists indi_pub_diamond stored as parquet as select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal from publication_datasources pd left outer join ( select pd.id, 1 as in_diamond_journal from publication_datasources pd join datasource d on d.id=pd.datasource - join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) + join STATS_EXT.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp on pd.id=tmp.id; -compute stats indi_pub_diamond; +ANALYZE TABLE indi_pub_diamond COMPUTE STATISTICS; ---create table indi_pub_hybrid stored as parquet as ---select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid ---from publication_datasources pd --- left outer join ( --- select pd.id, 1 as is_hybrid from publication_datasources pd --- join datasource d on d.id=pd.datasource --- join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) --- and (ps.journal_is_in_doaj=false and ps.journal_is_oa=false)) tmp --- on pd.id=tmp.id; --- ---compute stats indi_pub_hybrid; - -create table indi_pub_in_transformative stored as parquet as +create table if not exists indi_pub_in_transformative stored as parquet as select distinct pd.id, coalesce(is_transformative, 0) as is_transformative from publication pd left outer join ( select pd.id, 1 as is_transformative from publication_datasources pd join datasource d on d.id=pd.datasource - join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) + join STATS_EXT.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) and ps.is_transformative_journal=true) tmp on pd.id=tmp.id; -compute stats indi_pub_in_transformative; +ANALYZE TABLE indi_pub_in_transformative COMPUTE STATISTICS; -create table indi_pub_closed_other_open stored as parquet as +create table if not exists indi_pub_closed_other_open stored as parquet as select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from result_instance ri left outer join (select ri.id, 1 as pub_closed_other_open from result_instance ri @@ -248,187 +208,23 @@ select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_op (p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp on tmp.id=ri.id; -compute stats indi_pub_closed_other_open; +ANALYZE TABLE indi_pub_closed_other_open COMPUTE STATISTICS; ---- Sprint 5 ---- -create table indi_result_no_of_copies stored as parquet as +create table if not exists indi_result_no_of_copies stored as parquet as select id, count(id) as number_of_copies from result_instance group by id; -compute stats indi_result_no_of_copies; +ANALYZE TABLE indi_result_no_of_copies COMPUTE STATISTICS; ---- Sprint 6 ---- ---create table indi_pub_gold_oa stored as parquet as ---WITH gold_oa AS ( --- SELECT issn_l, journal_is_in_doaj,journal_is_oa, issn_1 as issn --- FROM stats_ext.oa_journals --- WHERE issn_1 != "" --- UNION ALL --- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_2 as issn --- FROM stats_ext.oa_journals --- WHERE issn_2 != "" ), ---issn AS ( --- SELECT * FROM --- (SELECT id, issn_printed as issn --- FROM datasource WHERE issn_printed IS NOT NULL --- UNION --- SELECT id, issn_online as issn --- FROM datasource WHERE issn_online IS NOT NULL) as issn --- WHERE LENGTH(issn) > 7) ---SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold ---FROM publication_datasources pd ---LEFT OUTER JOIN ( --- SELECT pd.id, 1 as is_gold FROM publication_datasources pd --- JOIN issn on issn.id=pd.datasource --- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id; - ---compute stats indi_pub_gold_oa; --- ---create table indi_datasets_gold_oa stored as parquet as ---WITH gold_oa AS ( --- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn --- FROM stats_ext.oa_journals --- WHERE issn_1 != "" --- UNION --- ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn --- FROM stats_ext.oa_journals --- WHERE issn_2 != "" ), ---issn AS ( --- SELECT * --- FROM ( --- SELECT id,issn_printed as issn --- FROM datasource --- WHERE issn_printed IS NOT NULL --- UNION --- SELECT id, issn_online as issn --- FROM datasource --- WHERE issn_online IS NOT NULL ) as issn --- WHERE LENGTH(issn) > 7) ---SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold ---FROM dataset_datasources pd ---LEFT OUTER JOIN ( --- SELECT pd.id, 1 as is_gold FROM dataset_datasources pd --- JOIN issn on issn.id=pd.datasource --- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id; --- ---compute stats indi_datasets_gold_oa; - ---create table indi_software_gold_oa stored as parquet as ---WITH gold_oa AS ( --- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn --- FROM stats_ext.oa_journals --- WHERE issn_1 != "" --- UNION --- ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn --- FROM stats_ext.oa_journals --- WHERE issn_2 != "" ), ---issn AS ( --- SELECT * --- FROM ( --- SELECT id,issn_printed as issn --- FROM datasource --- WHERE issn_printed IS NOT NULL --- UNION --- SELECT id, issn_online as issn --- FROM datasource --- WHERE issn_online IS NOT NULL ) as issn --- WHERE LENGTH(issn) > 7) ---SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold ---FROM software_datasources pd ---LEFT OUTER JOIN ( --- SELECT pd.id, 1 as is_gold FROM software_datasources pd --- JOIN issn on issn.id=pd.datasource --- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id; --- ---compute stats indi_software_gold_oa; - ---create table indi_org_findable stored as parquet as ---with result_with_pid as ( --- select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro --- join result_pids rp on rp.id=ro.id --- group by ro.organization), ---result_has_abstract as ( --- select ro.organization organization, count(distinct rp.id) no_result_with_abstract from result_organization ro --- join result rp on rp.id=ro.id where rp.abstract=true --- group by ro.organization), ---allresults as ( --- select organization, count(distinct id) no_allresults from result_organization --- group by organization), ---result_with_pid_share as ( --- select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults pid_share --- from allresults --- join result_with_pid on result_with_pid.organization=allresults.organization), ---result_with_abstract_share as ( --- select allresults.organization, result_has_abstract.no_result_with_abstract/allresults.no_allresults abstract_share --- from allresults --- join result_has_abstract on result_has_abstract.organization=allresults.organization) ---select allresults.organization, coalesce((pid_share+abstract_share)/2,pid_share) org_findable ---from allresults ---join result_with_pid_share on result_with_pid_share.organization=allresults.organization ---left outer join ( --- select organization, abstract_share from result_with_abstract_share) tmp on tmp.organization=allresults.organization; --- ---compute stats indi_org_findable; --- ---create table indi_org_openess stored as parquet as ---WITH datasets_oa as ( --- SELECT ro.organization, count(dg.id) no_oadatasets FROM indi_datasets_gold_oa dg --- join result_organization ro on dg.id=ro.id --- join dataset ds on dg.id=ds.id --- WHERE dg.is_gold=1 --- group by ro.organization), ---software_oa as ( --- SELECT ro.organization, count(dg.id) no_oasoftware FROM indi_software_gold_oa dg --- join result_organization ro on dg.id=ro.id --- join software ds on dg.id=ds.id --- WHERE dg.is_gold=1 --- group by ro.organization), ---pubs_oa as ( --- SELECT ro.organization, count(dg.id) no_oapubs FROM indi_pub_gold_oa dg --- join result_organization ro on dg.id=ro.id --- join publication ds on dg.id=ds.id --- where dg.is_gold=1 --- group by ro.organization), ---allpubs as ( --- SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro --- join publication ps on ps.id=ro.id --- group by ro.organization), ---alldatasets as ( --- SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro --- join dataset ps on ps.id=ro.id --- group by ro.organization), ---allsoftware as ( --- SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro --- join software ps on ps.id=ro.id --- group by ro.organization), ---allpubsshare as ( --- select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs --- join pubs_oa on allpubs.organization=pubs_oa.organization), ---alldatasetssshare as ( --- select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets c --- from alldatasets --- join datasets_oa on alldatasets.organization=datasets_oa.organization), ---allsoftwaresshare as ( --- select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s --- from allsoftware --- join software_oa on allsoftware.organization=software_oa.organization) ---select allpubsshare.organization, coalesce((c+p+s)/3, p) org_openess ---FROM allpubsshare ---left outer join ( --- select organization,c from --- alldatasetssshare) tmp on tmp.organization=allpubsshare.organization ---left outer join ( --- select organization,s from allsoftwaresshare) tmp1 on tmp1.organization=allpubsshare.organization; --- ---compute stats indi_org_openess; --- -create table indi_pub_hybrid_oa_with_cc stored as parquet as +create table if not exists indi_pub_hybrid_oa_with_cc stored as parquet as WITH hybrid_oa AS ( SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn - FROM stats_ext.plan_s_jn + FROM STATS_EXT.plan_s_jn WHERE issn_print != "" UNION ALL SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online as issn - FROM stats_ext.plan_s_jn + FROM STATS_EXT.plan_s_jn WHERE issn_online != "" and (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)), issn AS ( SELECT * @@ -436,7 +232,7 @@ create table indi_pub_hybrid_oa_with_cc stored as parquet as SELECT id, issn_printed as issn FROM datasource WHERE issn_printed IS NOT NULL - UNION + UNION ALL SELECT id,issn_online as issn FROM datasource WHERE issn_online IS NOT NULL ) as issn @@ -451,52 +247,51 @@ FROM publication_datasources pd JOIN indi_result_has_cc_licence cc on pd.id=cc.id where cc.has_cc_license=1) tmp on pd.id=tmp.id; -compute stats indi_pub_hybrid_oa_with_cc; +ANALYZE TABLE indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS; -create table indi_pub_downloads stored as parquet as +create table if not exists indi_pub_downloads stored as parquet as SELECT result_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats join publication on result_id=id where downloads>0 GROUP BY result_id order by no_downloads desc; -compute stats indi_pub_downloads; +ANALYZE TABLE indi_pub_downloads COMPUTE STATISTICS; -create table indi_pub_downloads_datasource stored as parquet as +create table if not exists indi_pub_downloads_datasource stored as parquet as SELECT result_id, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats join publication on result_id=id where downloads>0 GROUP BY result_id, repository_id order by result_id; -compute stats indi_pub_downloads_datasource; +ANALYZE TABLE indi_pub_downloads_datasource COMPUTE STATISTICS; -create table indi_pub_downloads_year stored as parquet as -SELECT result_id, substring(us.`date`, 1,4) as `year`, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us - join publication on result_id=id where downloads>0 -GROUP BY result_id, `year` -order by `year` asc; +create table if not exists indi_pub_downloads_year stored as parquet as +SELECT result_id, substring(us.`date`, 1,4) as `year`, sum(downloads) no_downloads +from openaire_prod_usage_stats.usage_stats us +join publication on result_id=id where downloads>0 +GROUP BY result_id, substring(us.`date`, 1,4); -compute stats indi_pub_downloads_year; +ANALYZE TABLE indi_pub_downloads_year COMPUTE STATISTICS; -create table indi_pub_downloads_datasource_year stored as parquet as +create table if not exists indi_pub_downloads_datasource_year stored as parquet as SELECT result_id, substring(us.`date`, 1,4) as `year`, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us - join publication on result_id=id +join publication on result_id=id where downloads>0 -GROUP BY result_id, repository_id, `year` -order by `year` asc, result_id; +GROUP BY result_id, repository_id, substring(us.`date`, 1,4); -compute stats indi_pub_downloads_datasource_year; +ANALYZE TABLE indi_pub_downloads_datasource_year COMPUTE STATISTICS; ---- Sprint 7 ---- -create table indi_pub_gold_oa stored as parquet as +create table if not exists indi_pub_gold_oa stored as parquet as WITH gold_oa AS ( SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn FROM - stats_ext.oa_journals + STATS_EXT.oa_journals WHERE issn_1 != "" UNION @@ -506,7 +301,7 @@ create table indi_pub_gold_oa stored as parquet as journal_is_oa, issn_2 as issn FROM - stats_ext.oa_journals + STATS_EXT.oa_journals WHERE issn_2 != "" ), issn AS ( SELECT * @@ -518,7 +313,7 @@ create table indi_pub_gold_oa stored as parquet as datasource WHERE issn_printed IS NOT NULL - UNION + UNION ALL SELECT id, issn_online as issn @@ -538,9 +333,9 @@ FROM JOIN gold_oa on issn.issn = gold_oa.issn) tmp on pd.id=tmp.id; -compute stats indi_pub_gold_oa; +ANALYZE TABLE indi_pub_gold_oa COMPUTE STATISTICS; -create table indi_pub_hybrid stored as parquet as +create table if not exists indi_pub_hybrid stored as parquet as WITH gold_oa AS ( SELECT issn_l, journal_is_in_doaj, @@ -548,7 +343,7 @@ create table indi_pub_hybrid stored as parquet as issn_1 as issn, has_apc FROM - stats_ext.oa_journals + STATS_EXT.oa_journals WHERE issn_1 != "" UNION @@ -559,7 +354,7 @@ create table indi_pub_hybrid stored as parquet as issn_2 as issn, has_apc FROM - stats_ext.oa_journals + STATS_EXT.oa_journals WHERE issn_2 != "" ), issn AS ( SELECT * @@ -571,7 +366,7 @@ create table indi_pub_hybrid stored as parquet as datasource WHERE issn_printed IS NOT NULL - UNION + UNION ALL SELECT id, issn_online as issn @@ -591,15 +386,15 @@ from publication_datasources pd where (gold_oa.journal_is_in_doaj=false or gold_oa.journal_is_oa=false))tmp on pd.id=tmp.id; -compute stats indi_pub_hybrid; +ANALYZE TABLE indi_pub_hybrid COMPUTE STATISTICS; -create table indi_org_fairness stored as parquet as +create table if not exists indi_org_fairness stored as parquet as --return results with PIDs, and rich metadata group by organization with result_fair as (select ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro join result r on r.id=ro.id --join result_pids rp on r.id=rp.id - where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and cast(year as int)>2003 + where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003 group by ro.organization), --return all results group by organization allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro @@ -611,16 +406,16 @@ select allresults.organization, result_fair.no_result_fair/allresults.no_allresu from allresults join result_fair on result_fair.organization=allresults.organization; -compute stats indi_org_fairness; +ANALYZE TABLE indi_org_fairness COMPUTE STATISTICS; -create table indi_org_fairness_pub_pr stored as parquet as +create table if not exists indi_org_fairness_pub_pr stored as parquet as with result_fair as (select ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro join publication p on p.id=ro.id join indi_pub_doi_from_crossref dc on dc.id=p.id join indi_pub_grey_lit gl on gl.id=p.id - where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) + where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003 and dc.doi_from_crossref=1 and gl.grey_lit=0 group by ro.organization), allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro @@ -632,150 +427,180 @@ select allresults.organization, result_fair.no_result_fair/allresults.no_allresu from allresults join result_fair on result_fair.organization=allresults.organization; -compute stats indi_org_fairness_pub_pr; +ANALYZE TABLE indi_org_fairness_pub_pr COMPUTE STATISTICS; -create table indi_org_fairness_pub_year stored as parquet as - with result_fair as - (select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro - join publication p on p.id=ro.id - where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and cast(year as int)>2003 - group by ro.organization, year), - allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro - join publication p on p.id=ro.id +CREATE TEMPORARY table result_fair as + select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro + join result p on p.id=ro.id + where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003 + group by ro.organization, year; + +CREATE TEMPORARY TABLE allresults as select year, organization, count(distinct ro.id) no_allresults from result_organization ro + join result p on p.id=ro.id where cast(year as int)>2003 - group by organization, year) + group by organization, year; + +create table if not exists indi_org_fairness_pub_year stored as parquet as select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness from allresults join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; -compute stats indi_org_fairness_pub_year; +DROP table result_fair purge; +DROP table allresults purge; -create table indi_org_fairness_pub as -with result_fair as - (select ro.organization organization, count(distinct ro.id) no_result_fair - from result_organization ro - join publication p on p.id=ro.id - where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) - and (authors>0) and cast(year as int)>2003 - group by ro.organization), - allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro - join publication p on p.id=ro.id - where cast(year as int)>2003 - group by organization) +ANALYZE TABLE indi_org_fairness_pub_year COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE result_fair as + select ro.organization organization, count(distinct ro.id) no_result_fair + from result_organization ro + join result p on p.id=ro.id + where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) + and (authors>0) and cast(year as int)>2003 + group by ro.organization; + +CREATE TEMPORARY TABLE allresults as + select organization, count(distinct ro.id) no_allresults from result_organization ro + join result p on p.id=ro.id + where cast(year as int)>2003 + group by organization; + +create table if not exists indi_org_fairness_pub as select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness -from allresults - join result_fair on result_fair.organization=allresults.organization; +from allresults join result_fair on result_fair.organization=allresults.organization; -compute stats indi_org_fairness_pub; +DROP table result_fair purge; +DROP table allresults purge; -create table indi_org_fairness_year stored as parquet as - with result_fair as - (select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro +ANALYZE TABLE indi_org_fairness_pub COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE result_fair as + select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro join result r on r.id=ro.id join result_pids rp on r.id=rp.id - where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and cast(year as int)>2003 - group by ro.organization, year), - allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro + where (title is not null) and (publisher is not null) and (abstract=true) and (year is not null) and (authors>0) and cast(year as int)>2003 + group by ro.organization, year; + +CREATE TEMPORARY TABLE allresults as + select year, organization, count(distinct ro.id) no_allresults from result_organization ro join result r on r.id=ro.id where cast(year as int)>2003 - group by organization, year) ---return results_fair/all_results -select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness -from allresults - join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; + group by organization, year; -compute stats indi_org_fairness_year; +create table if not exists indi_org_fairness_year stored as parquet as + select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness + from allresults + join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; -create table indi_org_findable_year stored as parquet as ---return results with PIDs group by organization,year - with result_with_pid as - (select year, ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro +DROP table result_fair purge; +DROP table allresults purge; + +ANALYZE TABLE indi_org_fairness_year COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE result_with_pid as + select year, ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro join result_pids rp on rp.id=ro.id join result r on r.id=rp.id where cast(year as int) >2003 - group by ro.organization, year), ---return all results group by organization,year - allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro + group by ro.organization, year; + +CREATE TEMPORARY TABLE allresults as + select year, organization, count(distinct ro.id) no_allresults from result_organization ro join result r on r.id=ro.id where cast(year as int) >2003 - group by organization, year) ---return results_with_pid/all_results + group by organization, year; + +create table if not exists indi_org_findable_year stored as parquet as select allresults.year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable from allresults join result_with_pid on result_with_pid.organization=allresults.organization and result_with_pid.year=allresults.year; -compute stats indi_org_findable_year; +DROP table result_with_pid purge; +DROP table allresults purge; -create table indi_org_findable stored as parquet as ---return results with PIDs group by organization - with result_with_pid as - (select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro +ANALYZE TABLE indi_org_findable_year COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE result_with_pid as +select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro join result_pids rp on rp.id=ro.id join result r on r.id=rp.id where cast(year as int) >2003 - group by ro.organization), ---return all results group by organization - allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro + group by ro.organization; + +CREATE TEMPORARY TABLE allresults as +select organization, count(distinct ro.id) no_allresults from result_organization ro join result r on r.id=ro.id where cast(year as int) >2003 - group by organization) ---return results_with_pid/all_results + group by organization; + +create table if not exists indi_org_findable stored as parquet as select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable from allresults join result_with_pid on result_with_pid.organization=allresults.organization; -compute stats indi_org_findable; +DROP table result_with_pid purge; +DROP table allresults purge; -create table indi_org_openess stored as parquet as - WITH pubs_oa as ( - SELECT ro.organization, count(distinct r.id) no_oapubs FROM publication r +ANALYZE TABLE indi_org_findable COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE pubs_oa as +SELECT ro.organization, count(distinct r.id) no_oapubs FROM publication r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization), - datasets_oa as ( - SELECT ro.organization, count(distinct r.id) no_oadatasets FROM dataset r + group by ro.organization; + +CREATE TEMPORARY TABLE datasets_oa as +SELECT ro.organization, count(distinct r.id) no_oadatasets FROM dataset r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization), - software_oa as ( - SELECT ro.organization, count(distinct r.id) no_oasoftware FROM software r + group by ro.organization; + +CREATE TEMPORARY TABLE software_oa as +SELECT ro.organization, count(distinct r.id) no_oasoftware FROM software r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization), - allpubs as ( - SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro + group by ro.organization; + +CREATE TEMPORARY TABLE allpubs as +SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro join publication ps on ps.id=ro.id where cast(ps.year as int)>2003 - group by ro.organization), - alldatasets as ( - SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro + group by ro.organization; + +CREATE TEMPORARY TABLE alldatasets as +SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro join dataset ps on ps.id=ro.id where cast(ps.year as int)>2003 - group by ro.organization), - allsoftware as ( - SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro + group by ro.organization; + +CREATE TEMPORARY TABLE allsoftware as +SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro join software ps on ps.id=ro.id where cast(ps.year as int)>2003 - group by ro.organization), - allpubsshare as ( - select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs - join pubs_oa on allpubs.organization=pubs_oa.organization), - alldatasetssshare as ( - select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d + group by ro.organization; + +CREATE TEMPORARY TABLE allpubsshare as +select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs + join pubs_oa on allpubs.organization=pubs_oa.organization; + +CREATE TEMPORARY TABLE alldatasetssshare as +select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d from alldatasets - join datasets_oa on alldatasets.organization=datasets_oa.organization), - allsoftwaresshare as ( - select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s + join datasets_oa on alldatasets.organization=datasets_oa.organization; + +CREATE TEMPORARY TABLE allsoftwaresshare as +select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s from allsoftware - join software_oa on allsoftware.organization=software_oa.organization) + join software_oa on allsoftware.organization=software_oa.organization; + +create table if not exists indi_org_openess stored as parquet as select allpubsshare.organization, - (p+isnull(s,0)+isnull(d,0))/(1+(case when s is null then 0 else 1 end) + (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) +(case when d is null then 0 else 1 end)) org_openess FROM allpubsshare left outer join (select organization,d from @@ -785,55 +610,75 @@ select allpubsshare.organization, allsoftwaresshare) tmp2 on tmp2.organization=allpubsshare.organization; -compute stats indi_org_openess; +DROP TABLE pubs_oa purge; +DROP TABLE datasets_oa purge; +DROP TABLE software_oa purge; +DROP TABLE allpubs purge; +DROP TABLE alldatasets purge; +DROP TABLE allsoftware purge; +DROP TABLE allpubsshare purge; +DROP TABLE alldatasetssshare purge; +DROP TABLE allsoftwaresshare purge; -create table indi_org_openess_year stored as parquet as - WITH pubs_oa as ( - SELECT r.year, ro.organization, count(distinct r.id) no_oapubs FROM publication r +ANALYZE TABLE indi_org_openess COMPUTE STATISTICS; + +CREATE TEMPORARY TABLE pubs_oa AS +SELECT r.year, ro.organization, count(distinct r.id) no_oapubs FROM publication r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization,r.year), - datasets_oa as ( - SELECT r.year,ro.organization, count(distinct r.id) no_oadatasets FROM dataset r + group by ro.organization,r.year; + +CREATE TEMPORARY TABLE datasets_oa AS +SELECT r.year,ro.organization, count(distinct r.id) no_oadatasets FROM dataset r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization, r.year), - software_oa as ( - SELECT r.year,ro.organization, count(distinct r.id) no_oasoftware FROM software r + group by ro.organization, r.year; + +CREATE TEMPORARY TABLE software_oa AS +SELECT r.year,ro.organization, count(distinct r.id) no_oasoftware FROM software r join result_organization ro on ro.id=r.id join result_instance ri on ri.id=r.id where (ri.accessright = 'Open Access' or ri.accessright = 'Embargo' or ri.accessright = 'Open Source') and cast(r.year as int)>2003 - group by ro.organization, r.year), - allpubs as ( - SELECT p.year,ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro + group by ro.organization, r.year; + +CREATE TEMPORARY TABLE allpubs as +SELECT p.year,ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro join publication p on p.id=ro.id where cast(p.year as int)>2003 - group by ro.organization, p.year), - alldatasets as ( - SELECT d.year, ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro + group by ro.organization, p.year; + +CREATE TEMPORARY TABLE alldatasets as +SELECT d.year, ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro join dataset d on d.id=ro.id where cast(d.year as int)>2003 - group by ro.organization, d.year), - allsoftware as ( - SELECT s.year,ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro + group by ro.organization, d.year; + +CREATE TEMPORARY TABLE allsoftware as +SELECT s.year,ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro join software s on s.id=ro.id where cast(s.year as int)>2003 - group by ro.organization, s.year), - allpubsshare as ( - select allpubs.year, pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs - join pubs_oa on allpubs.organization=pubs_oa.organization where cast(allpubs.year as INT)=cast(pubs_oa.year as int)), - alldatasetssshare as ( - select alldatasets.year, datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d + group by ro.organization, s.year; + +CREATE TEMPORARY TABLE allpubsshare as +select allpubs.year, pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs + join pubs_oa on allpubs.organization=pubs_oa.organization where cast(allpubs.year as INT)=cast(pubs_oa.year as int); + +CREATE TEMPORARY TABLE alldatasetssshare as +select alldatasets.year, datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets d from alldatasets - join datasets_oa on alldatasets.organization=datasets_oa.organization where cast(alldatasets.year as INT)=cast(datasets_oa.year as int)), - allsoftwaresshare as ( - select allsoftware.year, software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s + join datasets_oa on alldatasets.organization=datasets_oa.organization where cast(alldatasets.year as INT)=cast(datasets_oa.year as int); + +CREATE TEMPORARY TABLE allsoftwaresshare as +select allsoftware.year, software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s from allsoftware - join software_oa on allsoftware.organization=software_oa.organization where cast(allsoftware.year as INT)=cast(software_oa.year as int)) + join software_oa on allsoftware.organization=software_oa.organization where cast(allsoftware.year as INT)=cast(software_oa.year as int); + + +create table if not exists indi_org_openess_year stored as parquet as select allpubsshare.year, allpubsshare.organization, - (p+isnull(s,0)+isnull(d,0))/(1+(case when s is null then 0 else 1 end) + (p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end) +(case when d is null then 0 else 1 end)) org_openess FROM allpubsshare left outer join (select year, organization,d from @@ -843,9 +688,19 @@ select allpubsshare.year, allpubsshare.organization, allsoftwaresshare) tmp2 on tmp2.organization=allpubsshare.organization and tmp2.year=allpubsshare.year; -compute stats indi_org_openess_year; +DROP TABLE pubs_oa purge; +DROP TABLE datasets_oa purge; +DROP TABLE software_oa purge; +DROP TABLE allpubs purge; +DROP TABLE alldatasets purge; +DROP TABLE allsoftware purge; +DROP TABLE allpubsshare purge; +DROP TABLE alldatasetssshare purge; +DROP TABLE allsoftwaresshare purge; -create table indi_pub_has_preprint stored as parquet as +ANALYZE TABLE indi_org_openess_year COMPUTE STATISTICS; + +create table if not exists indi_pub_has_preprint stored as parquet as select distinct p.id, coalesce(has_preprint, 0) as has_preprint from publication_classifications p left outer join ( @@ -854,9 +709,9 @@ from publication_classifications p where p.type='Preprint') tmp on p.id= tmp.id; -compute stats indi_pub_has_preprint; +ANALYZE TABLE indi_pub_has_preprint COMPUTE STATISTICS; -create table indi_pub_in_subscribed stored as parquet as +create table if not exists indi_pub_in_subscribed stored as parquet as select distinct p.id, coalesce(is_subscription, 0) as is_subscription from publication p left outer join( @@ -867,9 +722,9 @@ from publication p where g.is_gold=0 and h.is_hybrid=0 and t.is_transformative=0) tmp on p.id=tmp.id; -compute stats indi_pub_in_subscribed; +ANALYZE TABLE indi_pub_in_subscribed COMPUTE STATISTICS; -create table indi_result_with_pid as +create table if not exists indi_result_with_pid as select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid from result p left outer join ( @@ -877,4 +732,4 @@ from result p from result_pids p) tmp on p.id= tmp.id; -compute stats indi_result_with_pid; \ No newline at end of file +ANALYZE TABLE indi_result_with_pid COMPUTE STATISTICS; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 237f68fae1..bc72b6c15d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -60,87 +60,92 @@ create table TARGET.result stored as parquet as 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan - 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork - 'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University - 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech - 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e' -- University of Cape Town - ))) foo; -compute stats TARGET.result; + 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork + 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University + 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech + 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town + 'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin + 'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology + 'openorgs____::846cb428d3f52a445f7275561a7beb5d' -- University of Manitoba + ) )) foo; + +ANALYZE TABLE TARGET.result COMPUTE STATISTICS; create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_citations; +ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS; create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_references_oc; +ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS; create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_citations_oc; +ANALYZE TABLE TARGET.result_citations_oc COMPUTE STATISTICS; create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_classifications; +ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS; create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_apc; +ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS; create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_concepts; +ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS; create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_datasources; +ANALYZE TABLE TARGET.result_datasources COMPUTE STATISTICS; create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_fundercount; +ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS; create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_gold; +ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS; create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_greenoa; +ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS; create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_languages; +ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS; create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_licenses; +ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS; create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized; +ANALYZE TABLE TARGET.licenses_normalized COMPUTE STATISTICS; create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_oids; +ANALYZE TABLE TARGET.result_oids COMPUTE STATISTICS; create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_organization; +ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS; create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_peerreviewed; +ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS; create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_pids; +ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS; create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_projectcount; +ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS; create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_projects; +ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS; create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_refereed; +ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS; create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_sources; +ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS; create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_topics; +ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS; create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_fos; +ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS; create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result); create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result); create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; drop view TARGET.foo1; drop view TARGET.foo2; -compute stats TARGET.result_result; +ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS; -- datasources create view if not exists TARGET.datasource as select * from SOURCE.datasource; @@ -149,7 +154,7 @@ create view if not exists TARGET.datasource_organizations as select * from SOURC create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources; create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources; -compute stats TARGET.datasource_results; +ANALYZE TABLE TARGET.datasource_results COMPUTE STATISTICS; -- organizations create view if not exists TARGET.organization as select * from SOURCE.organization; @@ -164,30 +169,31 @@ create view if not exists TARGET.project_oids as select * from SOURCE.project_oi create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations; create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount; create view if not exists TARGET.project_classification as select * from SOURCE.project_classification; +create view if not exists TARGET.project_organization_contribution as select * from SOURCE.project_organization_contribution; create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects; -compute stats TARGET.project_results; +ANALYZE TABLE TARGET.project_results COMPUTE STATISTICS; -- indicators -- Sprint 1 ---- create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_green_oa; +ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS; create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_grey_lit; +ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS; create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_doi_from_crossref; +ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS; -- Sprint 2 ---- create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_result_has_cc_licence; +ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS; create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_result_has_cc_licence_url; +ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS; create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_has_abstract; +ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS; create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_result_with_orcid; +ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS; ---- Sprint 3 ---- create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_funded_result_with_fundref; +ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS; create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab; create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab; create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org; @@ -196,30 +202,30 @@ create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funde create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab; ---- Sprint 4 ---- create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_diamond; +ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS; create table TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_in_transformative; +ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS; create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_closed_other_open; +ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS; ---- Sprint 5 ---- create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_result_no_of_copies; +ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS; ---- Sprint 6 ---- create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_hybrid_oa_with_cc; +ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS; create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -compute stats TARGET.indi_pub_downloads; +ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS; create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -compute stats TARGET.indi_pub_downloads_datasource; +ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS; create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -compute stats TARGET.indi_pub_downloads_year; +ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS; create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); -compute stats TARGET.indi_pub_downloads_datasource_year; +ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS; ---- Sprint 7 ---- create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_gold_oa; +ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS; create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_pub_hybrid; +ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS; create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness; create view TARGET.indi_org_fairness_pub_pr as select * from SOURCE.indi_org_fairness_pub_pr; create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year; @@ -230,11 +236,8 @@ create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable; create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess; create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year; create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS; create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id); +ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS; create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id); - ---create table TARGET.indi_datasets_gold_oa stored as parquet as select * from SOURCE.indi_datasets_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---compute stats TARGET.indi_datasets_gold_oa; ---create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); ---compute stats TARGET.indi_software_gold_oa; - +ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql index e24370e7d3..2d7d572b38 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step21-createObservatoryDB.sql @@ -8,6 +8,8 @@ from ${stats_db_name}.result r group by rl.id ) rln on rln.id=r.id; +ANALYZE TABLE ${observatory_db_name}.result_cc_licence COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_country stored as parquet as select count(distinct r.id) as total, @@ -37,6 +39,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_year stored as parquet as select count(distinct r.id) as total, @@ -66,6 +70,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_year COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as select count(distinct r.id) as total, @@ -95,6 +101,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_year_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as select count(distinct r.id) as total, @@ -126,6 +134,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_datasource COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as select count(distinct r.id) as total, @@ -157,6 +167,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_datasource_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_organization stored as parquet as select count(distinct r.id) as total, @@ -186,6 +198,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_organization COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as select count(distinct r.id) as total, @@ -215,6 +229,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_organization_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_funder stored as parquet as select count(distinct r.id) as total, @@ -246,6 +262,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_funder COMPUTE STATISTICS; + create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as select count(distinct r.id) as total, @@ -277,6 +295,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_affiliated_funder_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_country stored as parquet as select count(distinct r.id) as total, @@ -308,6 +328,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_year stored as parquet as select count(distinct r.id) as total, @@ -339,6 +361,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; +ANALYZE TABLE ${observatory_db_name}.result_deposited_year COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_year_country stored as parquet as select count(distinct r.id) as total, @@ -370,6 +394,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_year_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_datasource stored as parquet as select count(distinct r.id) as total, @@ -401,6 +427,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_datasource COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as select count(distinct r.id) as total, @@ -432,6 +460,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_datasource_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_organization stored as parquet as select count(distinct r.id) as total, @@ -463,6 +493,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_organization COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as select count(distinct r.id) as total, @@ -494,6 +526,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; +ANALYZE TABLE ${observatory_db_name}.result_deposited_organization_country COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_funder stored as parquet as select count(distinct r.id) as total, @@ -527,6 +561,8 @@ group by r.green, r.gold, case when rl.type is not null then true else false end case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; +ANALYZE TABLE ${observatory_db_name}.result_deposited_funder COMPUTE STATISTICS; + create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as select count(distinct r.id) as total, @@ -558,4 +594,6 @@ from ${stats_db_name}.result r left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end, case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract, - cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; \ No newline at end of file + cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; + +ANALYZE TABLE ${observatory_db_name}.result_deposited_funder_country COMPUTE STATISTICS; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index c31180c141..e0522e1495 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -84,4 +84,12 @@ create table ${stats_db_name}.funder STORED AS PARQUET as select distinct xpath_string(fund, '//funder/id') as id, xpath_string(fund, '//funder/name') as name, xpath_string(fund, '//funder/shortname') as shortname -from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; \ No newline at end of file +from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; + +CREATE TABLE ${stats_db_name}.project_organization_contribution STORED AS PARQUET AS +SELECT distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization, +properties[0].value contribution, properties[1].value currency +from ${openaire_db_name}.relation r +LATERAL VIEW explode (r.properties) properties +where properties[0].key='contribution' and r.reltype = 'projectOrganization' and r.source like '40|%' +and properties[0].value>0.0 and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 01bed17cc1..248716b369 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -48,12 +48,10 @@ WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false; -- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table. -- Creating a temporary dual table that will be removed after the following insert -CREATE TABLE ${stats_db_name}.dual -( - dummy CHAR(1) -); -INSERT INTO ${stats_db_name}.dual -VALUES ('X'); +CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1)); + +INSERT INTO ${stats_db_name}.dual VALUES ('X'); + INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`, `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`) SELECT 'other', @@ -73,12 +71,8 @@ FROM ${stats_db_name}.dual WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); DROP TABLE ${stats_db_name}.dual; -UPDATE ${stats_db_name}.datasource_tmp -SET name='Other' -WHERE name = 'Unknown Repository'; -UPDATE ${stats_db_name}.datasource_tmp -SET yearofvalidation=null -WHERE yearofvalidation = '-1'; +UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository'; +UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1'; CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS SELECT substr(d.id, 4) AS id, langs.languages AS language @@ -104,4 +98,4 @@ where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS SELECT datasource AS id, id AS result -FROM ${stats_db_name}.result_datasources; \ No newline at end of file +FROM ${stats_db_name}.result_datasources; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index c68ae46cac..8d2e56380c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + stats_db_name @@ -10,7 +10,6 @@ external_stats_db_name - stats_ext the external stats that should be added since they are not included in the graph database @@ -53,6 +52,10 @@ context_api_url the base url of the context api (https://services.openaire.eu/openaire) + + hadoop_user_name + user name of the wf owner + @@ -67,11 +70,14 @@ hive.txn.timeout ${hive_timeout} + + mapred.job.queue.name + analytics + - ${wf:conf('resumeFrom') eq 'Step1'} @@ -99,12 +105,13 @@ ${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-pre'} ${wf:conf('resumeFrom') eq 'step21-createObservatoryDB'} ${wf:conf('resumeFrom') eq 'step21-createObservatoryDB-post'} - ${wf:conf('resumeFrom') eq 'Step22'} + ${wf:conf('resumeFrom') eq 'step22-copyDataToImpalaCluster'} + ${wf:conf('resumeFrom') eq 'step23-finalizeImpalaCluster'} + ${wf:conf('resumeFrom') eq 'Step24-updateCache'} - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -282,6 +289,7 @@ stats_db_name=${stats_db_name} openaire_db_name=${openaire_db_name} + external_stats_db_name=${external_stats_db_name} @@ -306,6 +314,7 @@ ${nameNode} indicators.sh ${stats_db_name} + ${external_stats_db_name} ${wf:appPath()}/scripts/step16-createIndicatorsTables.sql indicators.sh @@ -334,7 +343,7 @@ - + ${jobTracker} @@ -359,6 +368,19 @@ ${wf:appPath()}/scripts/step20-createMonitorDB.sql monitor.sh + + + + + + + ${jobTracker} + ${nameNode} + monitor-post.sh + ${monitor_db_name} + ${monitor_db_shadow_name} + monitor-post.sh + @@ -393,16 +415,50 @@ ${jobTracker} ${nameNode} observatory-post.sh - ${stats_db_name} ${observatory_db_name} ${observatory_db_shadow_name} observatory-post.sh - + - + + + ${jobTracker} + ${nameNode} + copyDataToImpalaCluster.sh + + + ${stats_db_name} + ${monitor_db_name} + ${observatory_db_name} + ${external_stats_db_name} + ${hadoop_user_name} + copyDataToImpalaCluster.sh + + + + + + + + ${jobTracker} + ${nameNode} + finalizeImpalaCluster.sh + ${stats_db_name} + ${stats_db_shadow_name} + ${monitor_db_name} + ${monitor_db_shadow_name} + ${observatory_db_name} + ${observatory_db_shadow_name} + finalizeImpalaCluster.sh + + + + + + ${jobTracker} ${nameNode} @@ -415,4 +471,4 @@ - \ No newline at end of file +