Merge branch 'beta' into fulltext_url_validation

This commit is contained in:
Claudio Atzori 2023-06-12 09:55:25 +02:00
commit 4b00a76271
21 changed files with 2081 additions and 133 deletions

View File

@ -642,12 +642,12 @@
"PANGAEA.REPOSITORY": { "PANGAEA.REPOSITORY": {
"openaire_id": "re3data_____::r3d100010134", "openaire_id": "re3data_____::r3d100010134",
"datacite_name": "PANGAEA", "datacite_name": "PANGAEA",
"official_name": "PANGAEA" "official_name": "PANGAEA - Data Publisher for Earth and Environmental Science"
}, },
"TIB.PANGAEA": { "TIB.PANGAEA": {
"openaire_id": "re3data_____::r3d100010134", "openaire_id": "re3data_____::r3d100010134",
"datacite_name": "PANGAEA", "datacite_name": "PANGAEA",
"official_name": "PANGAEA" "official_name": "PANGAEA - Data Publisher for Earth and Environmental Science"
}, },
"NASAPDS.NASAPDS": { "NASAPDS.NASAPDS": {
"openaire_id": "re3data_____::r3d100010121", "openaire_id": "re3data_____::r3d100010121",

View File

@ -15,6 +15,7 @@ public class Community implements Serializable {
private List<Provider> providers = new ArrayList<>(); private List<Provider> providers = new ArrayList<>();
private List<ZenodoCommunity> zenodoCommunities = new ArrayList<>(); private List<ZenodoCommunity> zenodoCommunities = new ArrayList<>();
private SelectionConstraints constraints = new SelectionConstraints(); private SelectionConstraints constraints = new SelectionConstraints();
private SelectionConstraints removeConstraints = new SelectionConstraints();
public String toJson() { public String toJson() {
final Gson g = new Gson(); final Gson g = new Gson();
@ -67,4 +68,12 @@ public class Community implements Serializable {
public void setConstraints(SelectionConstraints constraints) { public void setConstraints(SelectionConstraints constraints) {
this.constraints = constraints; this.constraints = constraints;
} }
/**
 * Returns the remove constraints configured for this community.
 * The field defaults to an empty {@link SelectionConstraints} instance.
 *
 * @return the current remove constraints
 */
public SelectionConstraints getRemoveConstraints() {
return removeConstraints;
}
/**
 * Replaces the remove constraints of this community.
 *
 * @param removeConstraints the constraints to store; the value is stored as-is, without a null check
 */
public void setRemoveConstraints(SelectionConstraints removeConstraints) {
this.removeConstraints = removeConstraints;
}
} }

View File

@ -28,6 +28,8 @@ public class CommunityConfiguration implements Serializable {
private Map<String, SelectionConstraints> selectionConstraintsMap = new HashMap<>(); private Map<String, SelectionConstraints> selectionConstraintsMap = new HashMap<>();
// map eosc datasource -> communityid // map eosc datasource -> communityid
private Map<String, List<Pair<String, SelectionConstraints>>> eoscDatasourceMap = new HashMap<>(); private Map<String, List<Pair<String, SelectionConstraints>>> eoscDatasourceMap = new HashMap<>();
// map communityid -> remove constraints
private Map<String, SelectionConstraints> removeConstraintsMap = new HashMap<>();
public Map<String, List<Pair<String, SelectionConstraints>>> getEoscDatasourceMap() { public Map<String, List<Pair<String, SelectionConstraints>>> getEoscDatasourceMap() {
return eoscDatasourceMap; return eoscDatasourceMap;
@ -71,6 +73,14 @@ public class CommunityConfiguration implements Serializable {
this.selectionConstraintsMap = selectionConstraintsMap; this.selectionConstraintsMap = selectionConstraintsMap;
} }
/**
 * Returns the map from community id to the remove constraints of that community.
 *
 * @return the backing map itself (not a copy)
 */
public Map<String, SelectionConstraints> getRemoveConstraintsMap() {
return removeConstraintsMap;
}
/**
 * Replaces the map from community id to remove constraints.
 *
 * @param removeConstraintsMap the map to store; the reference is kept as-is, without a defensive copy
 */
public void setRemoveConstraintsMap(Map<String, SelectionConstraints> removeConstraintsMap) {
this.removeConstraintsMap = removeConstraintsMap;
}
CommunityConfiguration(final Map<String, Community> communities) { CommunityConfiguration(final Map<String, Community> communities) {
this.communities = communities; this.communities = communities;
init(); init();
@ -90,6 +100,9 @@ public class CommunityConfiguration implements Serializable {
if (selectionConstraintsMap == null) { if (selectionConstraintsMap == null) {
selectionConstraintsMap = Maps.newHashMap(); selectionConstraintsMap = Maps.newHashMap();
} }
if (removeConstraintsMap == null) {
removeConstraintsMap = Maps.newHashMap();
}
for (Community c : getCommunities().values()) { for (Community c : getCommunities().values()) {
// get subjects // get subjects
@ -111,6 +124,8 @@ public class CommunityConfiguration implements Serializable {
zenodocommunityMap); zenodocommunityMap);
} }
selectionConstraintsMap.put(id, c.getConstraints()); selectionConstraintsMap.put(id, c.getConstraints());
removeConstraintsMap.put(id, c.getRemoveConstraints());
} }
} }

View File

@ -86,6 +86,7 @@ public class CommunityConfigurationFactory {
c.setProviders(parseDatasources(node)); c.setProviders(parseDatasources(node));
c.setZenodoCommunities(parseZenodoCommunities(node)); c.setZenodoCommunities(parseZenodoCommunities(node));
c.setConstraints(parseConstrains(node)); c.setConstraints(parseConstrains(node));
c.setRemoveConstraints(parseRemoveConstrains(node));
return c; return c;
} }
@ -102,6 +103,19 @@ public class CommunityConfigurationFactory {
return selectionConstraints; return selectionConstraints;
} }
/**
 * Parses the optional {@code removeConstraints} child of a community node.
 *
 * <p>The text of the child is expected to be the JSON serialization of a
 * {@link SelectionConstraints}. When the child is missing, blank, or deserializes to
 * {@code null}, an empty {@link SelectionConstraints} is returned so callers never
 * receive {@code null}.
 *
 * @param node the community node to read from
 * @return the parsed remove constraints with the shared resolver applied, or an empty instance
 */
private static SelectionConstraints parseRemoveConstrains(Node node) {
	final Node constsNode = node.selectSingleNode("./removeConstraints");
	if (constsNode == null || StringUtils.isBlank(StringUtils.trim(constsNode.getText()))) {
		return new SelectionConstraints();
	}

	final SelectionConstraints removeConstraints = new Gson()
		.fromJson(constsNode.getText(), SelectionConstraints.class);
	if (removeConstraints == null) {
		// Gson returns null for a JSON "null" document; treat it like a missing configuration.
		return new SelectionConstraints();
	}

	removeConstraints.setSelection(resolver);

	// The criteria list may be absent in the JSON (e.g. "{}"); do not let the log line throw.
	final int criteriaCount = removeConstraints.getCriteria() == null
		? 0
		: removeConstraints.getCriteria().size();
	log.info("number of remove constraints set " + criteriaCount);

	return removeConstraints;
}
private static List<String> parseSubjects(final Node node) { private static List<String> parseSubjects(final Node node) {
final List<String> subjects = Lists.newArrayList(); final List<String> subjects = Lists.newArrayList();

View File

@ -79,6 +79,23 @@ public class ResultTagger implements Serializable {
break; break;
} }
// removeCommunities contains all the communities that must not be added to the context
final Set<String> removeCommunities = new HashSet<>();
conf
.getRemoveConstraintsMap()
.keySet()
.forEach(communityId -> {
if (conf.getRemoveConstraintsMap().get(communityId).getCriteria() != null &&
conf
.getRemoveConstraintsMap()
.get(communityId)
.getCriteria()
.stream()
.anyMatch(crit -> crit.verifyCriteria(param)))
removeCommunities.add(communityId);
});
// communities contains all the communities to be added as context for the result // communities contains all the communities to be added as context for the result
final Set<String> communities = new HashSet<>(); final Set<String> communities = new HashSet<>();
@ -164,7 +181,8 @@ public class ResultTagger implements Serializable {
.getSelectionConstraintsMap() .getSelectionConstraintsMap()
.keySet() .keySet()
.forEach(communityId -> { .forEach(communityId -> {
if (conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null && if (!removeCommunities.contains(communityId) &&
conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null &&
conf conf
.getSelectionConstraintsMap() .getSelectionConstraintsMap()
.get(communityId) .get(communityId)
@ -175,6 +193,9 @@ public class ResultTagger implements Serializable {
}); });
communities.addAll(aconstraints); communities.addAll(aconstraints);
communities.removeAll(removeCommunities);
if (aconstraints.size() > 0) if (aconstraints.size() > 0)
log.info("Found {} for advancedConstraints ", aconstraints.size()); log.info("Found {} for advancedConstraints ", aconstraints.size());

View File

@ -10,6 +10,9 @@ where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] and $x//con
return return
<community> <community>
{ $x//CONFIGURATION/context/@id} { $x//CONFIGURATION/context/@id}
<removeConstraints>
{$x//CONFIGURATION/context/param[./@name='removeConstraints']/text() }
</removeConstraints>
<advancedConstraints> <advancedConstraints>
{$x//CONFIGURATION/context/param[./@name='advancedConstraints']/text() } {$x//CONFIGURATION/context/param[./@name='advancedConstraints']/text() }
</advancedConstraints> </advancedConstraints>

View File

@ -39,8 +39,10 @@ public class BulkTagJobTest {
+ " \"contributor\" : \"$['contributor'][*]['value']\"," + " \"contributor\" : \"$['contributor'][*]['value']\","
+ " \"description\" : \"$['description'][*]['value']\", " + " \"description\" : \"$['description'][*]['value']\", "
+ " \"subject\" :\"$['subject'][*]['value']\" , " + + " \"subject\" :\"$['subject'][*]['value']\" , " +
"\"fos\" : \"$['subject'][?(@['qualifier']['classid']=='FOS')].value\"" + "\"fos\" : \"$['subject'][?(@['qualifier']['classid']=='FOS')].value\"," +
"} "; "\"sdg\" : \"$['subject'][?(@['qualifier']['classid']=='SDG')].value\"," +
"\"hostedby\" : \"$['instance'][*]['hostedby']['key']\" , " +
"\"collectedfrom\" : \"$['instance'][*]['collectedfrom']['key']\"} ";
private static SparkSession spark; private static SparkSession spark;
@ -56,7 +58,7 @@ public class BulkTagJobTest {
.toString( .toString(
BulkTagJobTest.class BulkTagJobTest.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf_dth.xml")); "/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf_remove.xml"));
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
} }
@ -1525,4 +1527,45 @@ public class BulkTagJobTest {
.count()); .count());
} }
/**
 * Runs the bulk tagging job on the "update_datasourcewithconstraints" sample using the
 * class-level tagging configuration, then checks the number of produced records and how
 * many bulk-tagged contexts refer to the 'dth' community.
 */
@Test
void removeTest() throws Exception {
	final String sourcePath = getClass()
		.getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/update_datasourcewithconstraints")
		.getPath();
	final String outputPath = workingDir.toString() + "/dataset";

	final String[] jobArgs = {
		"-isTest", Boolean.TRUE.toString(),
		"-isSparkSessionManaged", Boolean.FALSE.toString(),
		"-sourcePath", sourcePath,
		"-taggingConf", taggingConf,
		"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
		"-outputPath", outputPath,
		"-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
		"-pathMap", BulkTagJobTest.pathMap
	};
	SparkBulkTagJob.main(jobArgs);

	// Read back what the job wrote, one JSON-serialized Dataset per line.
	final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
	final JavaRDD<Dataset> taggedDatasets = sc
		.textFile(outputPath)
		.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));

	Assertions.assertEquals(12, taggedDatasets.count());

	final org.apache.spark.sql.Dataset<Dataset> datasetView = spark
		.createDataset(taggedDatasets.rdd(), Encoders.bean(Dataset.class));
	datasetView.createOrReplaceTempView("dataset");

	// One row per (result, context, datainfo) whose provenance is bulk tagging.
	final String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
		+ "from dataset "
		+ "lateral view explode(context) c as MyT "
		+ "lateral view explode(MyT.datainfo) d as MyD "
		+ "where MyD.inferenceprovenance = 'bulktagging'";
	final org.apache.spark.sql.Dataset<Row> bulkTaggedContexts = spark.sql(query);

	Assertions.assertEquals(3, bulkTaggedContexts.filter("community = 'dth'").count());
}
} }

View File

@ -21,7 +21,7 @@
</property> </property>
<property> <property>
<name>hive_jdbc_url</name> <name>hive_jdbc_url</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=19166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=11596411699;spark.yarn.driver.memoryOverhead=1228</value> <value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=22166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=15596411699;spark.yarn.driver.memoryOverhead=1228</value>
</property> </property>
<property> <property>
<name>oozie.wf.workflow.notification.url</name> <name>oozie.wf.workflow.notification.url</name>

View File

@ -68,6 +68,16 @@ copydb $USAGE_STATS_DB
copydb $PROD_USAGE_STATS_DB copydb $PROD_USAGE_STATS_DB
copydb $EXT_DB copydb $EXT_DB
copydb $STATS_DB copydb $STATS_DB
copydb $MONITOR_DB #copydb $MONITOR_DB
copydb $OBSERVATORY_DB copydb $OBSERVATORY_DB
copydb $MONITOR_DB'_funded'
copydb $MONITOR_DB'_institutions'
copydb $MONITOR_DB'_RIs_tail'
contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
for i in ${contexts}
do
tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
copydb ${MONITOR_DB}'_'${tmp}
done

View File

@ -29,3 +29,14 @@ createShadowDB $STATS_DB $STATS_DB_SHADOW
createShadowDB $MONITOR_DB $MONITOR_DB_SHADOW createShadowDB $MONITOR_DB $MONITOR_DB_SHADOW
createShadowDB $OBSERVATORY_DB $OBSERVATORY_DB_SHADOW createShadowDB $OBSERVATORY_DB $OBSERVATORY_DB_SHADOW
createShadowDB USAGE_STATS_DB USAGE_STATS_DB_SHADOW createShadowDB USAGE_STATS_DB USAGE_STATS_DB_SHADOW
createShadowDB $MONITOR_DB'_funded' $MONITOR_DB'_funded_shadow'
createShadowDB $MONITOR_DB'_institutions' $MONITOR_DB'_institutions_shadow'
createShadowDB $MONITOR_DB'_RIs_tail' $MONITOR_DB'_RIs_tail_shadow'
contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
for i in ${contexts}
do
tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
createShadowDB ${MONITOR_DB}'_'${tmp} ${MONITOR_DB}'_'${tmp}'_shadow'
done

View File

@ -10,16 +10,88 @@ export SOURCE=$1
export TARGET=$2 export TARGET=$2
export SHADOW=$3 export SHADOW=$3
export SCRIPT_PATH=$4 export SCRIPT_PATH=$4
export SCRIPT_PATH2=$5
export SCRIPT_PATH3=$6
export SCRIPT_PATH4=$7
export SCRIPT_PATH5=$8
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HADOOP_USER_NAME="oozie" export HADOOP_USER_NAME="oozie"
echo "Getting file from " $SCRIPT_PATH echo "Getting file from " $4
hdfs dfs -copyToLocal $SCRIPT_PATH hdfs dfs -copyToLocal $4
echo "Getting file from " $5
hdfs dfs -copyToLocal $5
echo "Getting file from " $6
hdfs dfs -copyToLocal $6
echo "Getting file from " $7
hdfs dfs -copyToLocal $7
echo "Getting file from " $8
hdfs dfs -copyToLocal $8
echo "Creating monitor database" echo "Creating monitor database"
#cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo cat step20-createMonitorDB_funded.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_funded/g1" > foo
cat step20-createMonitorDB.sql | sed "s/TARGET/${TARGET}/g" | sed "s/SOURCE/${SOURCE}/g1" > foo
hive $HIVE_OPTS -f foo hive $HIVE_OPTS -f foo
cat step20-createMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_funded/g1" > foo
hive $HIVE_OPTS -f foo
#
cat step20-createMonitorDB_institutions.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_institutions/g1" > foo
hive $HIVE_OPTS -f foo
cat step20-createMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_institutions/g1" > foo
hive $HIVE_OPTS -f foo
contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
for i in ${contexts}
do
tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
tmp2=`echo "$i" |sed 's/:.*//' `
cat step20-createMonitorDB_RIs.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_$tmp/g1" | sed "s/CONTEXT/\'%$tmp2%\'/g" > foo
hive $HIVE_OPTS -f foo
cat step20-createMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_$tmp/g1" > foo
hive $HIVE_OPTS -f foo
done
cat step20-createMonitorDB_RIs_tail.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_RIs_tail/g1" | sed "s/CONTEXTS/\"'knowmad::other','dh-ch::other', 'enermaps::other', 'gotriple::other', 'neanias-atmospheric::other', 'rural-digital-europe::other', 'covid-19::other', 'aurora::other', 'neanias-space::other', 'north-america-studies::other', 'north-american-studies::other', 'eutopia::other'\"/g" > foo
hive $HIVE_OPTS -f foo
cat step20-createMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_RIs_tail/g1" > foo
hive $HIVE_OPTS -f foo
echo "Hive shell finished" echo "Hive shell finished"
echo "Updating shadow monitor funded database"
hive -e "drop database if exists ${SHADOW}_funded cascade"
hive -e "create database if not exists ${SHADOW}_funded"
hive $HIVE_OPTS --database ${2}_funded -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_funded.\1 as select * from ${2}_funded.\1;/" > foo
hive -f foo
echo "Updated shadow monitor funded database"
echo "Updating shadow monitor insitutions database"
hive -e "drop database if exists ${SHADOW}_institutions cascade"
hive -e "create database if not exists ${SHADOW}_institutions"
hive $HIVE_OPTS --database ${2}_institutions -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_institutions.\1 as select * from ${2}_institutions.\1;/" > foo
hive -f foo
echo "Shadow db monitor insitutions ready!"
echo "Updating shadow monitor RIs database"
for i in $contexts
do
tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
hive -e "drop database if exists ${SHADOW}_${tmp} cascade"
hive -e "create database if not exists ${SHADOW}_${tmp}"
hive $HIVE_OPTS --database ${2}_${tmp} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_${tmp}.\1 as select * from ${2}_${tmp}.\1;/" > foo
hive -f foo
done
echo "Shadow db monitor RIs ready!"
echo "Updating shadow monitor RIs tail database"
hive -e "drop database if exists ${SHADOW}_ris_tail cascade"
hive -e "create database if not exists ${SHADOW}_ris_tail"
hive $HIVE_OPTS --database ${2}_ris_tail -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_ris_tail.\1 as select * from ${2}_ris_tail.\1;/" > foo
hive -f foo
echo "Shadow db monitor RIs tail ready!"

View File

@ -46,4 +46,8 @@ FROM (
LEFT OUTER JOIN ( LEFT OUTER JOIN (
SELECT substr(d.id, 4) id SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id; WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as
select distinct substr(id,4),id, accessroute from ${openaire_db_name}.result
lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute;

View File

@ -92,53 +92,59 @@ ANALYZE TABLE indi_funded_result_with_fundref COMPUTE STATISTICS;
-- --
-- compute stats indi_result_org_collab; -- compute stats indi_result_org_collab;
-- --
create TEMPORARY TABLE tmp AS SELECT ro.organization organization, ro.id from result_organization ro create TEMPORARY TABLE tmp AS SELECT ro.organization organization, ro.id, o.name from result_organization ro
join organization o on o.id=ro.organization where o.name is not null; join organization o on o.id=ro.organization where o.name is not null;
create table if not exists indi_result_org_collab stored as parquet as create table if not exists indi_result_org_collab stored as parquet as
select o1.organization org1, o2.organization org2, count(o1.id) as collaborations select o1.organization org1, o1.name org1name1, o2.organization org2, o2.name org2name2, count(o1.id) as collaborations
from tmp as o1 from tmp as o1
join tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization join tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization and o1.name!=o2.name
group by o1.organization, o2.organization; group by o1.organization, o2.organization, o1.name, o2.name;
drop table tmp purge; drop table tmp purge;
ANALYZE TABLE indi_result_org_collab COMPUTE STATISTICS; ANALYZE TABLE indi_result_org_collab COMPUTE STATISTICS;
create TEMPORARY TABLE tmp AS create TEMPORARY TABLE tmp AS
select distinct ro.organization organization, ro.id, o.country from result_organization ro select distinct ro.organization organization, ro.id, o.name, o.country from result_organization ro
join organization o on o.id=ro.organization where country <> 'UNKNOWN' and o.name is not null; join organization o on o.id=ro.organization where country <> 'UNKNOWN' and o.name is not null;
create table if not exists indi_result_org_country_collab stored as parquet as create table if not exists indi_result_org_country_collab stored as parquet as
select o1.organization org1,o2.country country2, count(o1.id) as collaborations select o1.organization org1,o1.name org1name1, o2.country country2, count(o1.id) as collaborations
from tmp as o1 join tmp as o2 on o1.id=o2.id from tmp as o1 join tmp as o2 on o1.id=o2.id
where o1.id=o2.id and o1.country!=o2.country where o1.id=o2.id and o1.country!=o2.country
group by o1.organization, o1.id, o2.country; group by o1.organization, o1.id, o1.name, o2.country;
drop table tmp purge; drop table tmp purge;
ANALYZE TABLE indi_result_org_country_collab COMPUTE STATISTICS; ANALYZE TABLE indi_result_org_country_collab COMPUTE STATISTICS;
create TEMPORARY TABLE tmp AS
select o.id organization, o.name, ro.project as project from organization o
join organization_projects ro on o.id=ro.id where o.name is not null;
create table if not exists indi_project_collab_org stored as parquet as create table if not exists indi_project_collab_org stored as parquet as
select o1.id org1,o2.id org2, count(distinct o1.project) as collaborations select o1.organization org1,o1.name orgname1, o2.organization org2, o2.name orgname2, count(distinct o1.project) as collaborations
from organization_projects as o1 from tmp as o1
join organization_projects as o2 on o1.project=o2.project join tmp as o2 on o1.project=o2.project
where o1.id!=o2.id where o1.organization<>o2.organization and o1.name<>o2.name
group by o1.id, o2.id; group by o1.name,o2.name, o1.organization, o2.organization;
drop table tmp purge;
ANALYZE TABLE indi_project_collab_org COMPUTE STATISTICS; ANALYZE TABLE indi_project_collab_org COMPUTE STATISTICS;
create TEMPORARY TABLE tmp AS create TEMPORARY TABLE tmp AS
select o.id organization, o.country , ro.project as project from organization o select o.id organization, o.name, o.country , ro.project as project from organization o
join organization_projects ro on o.id=ro.id join organization_projects ro on o.id=ro.id
and o.country <> 'UNKNOWN'; and o.country <> 'UNKNOWN' and o.name is not null;
create table if not exists indi_project_collab_org_country stored as parquet as create table if not exists indi_project_collab_org_country stored as parquet as
select o1.organization org1,o2.country country2, count(distinct o1.project) as collaborations select o1.organization org1,o1.name org1name, o2.country country2, count(distinct o1.project) as collaborations
from tmp as o1 from tmp as o1
join tmp as o2 on o1.project=o2.project join tmp as o2 on o1.project=o2.project
where o1.organization<>o2.organization and o1.country<>o2.country where o1.organization<>o2.organization and o1.country<>o2.country
group by o1.organization, o2.country; group by o1.organization, o2.country, o1.name;
drop table tmp purge; drop table tmp purge;
@ -217,38 +223,6 @@ select id, count(id) as number_of_copies from result_instance group by id;
ANALYZE TABLE indi_result_no_of_copies COMPUTE STATISTICS; ANALYZE TABLE indi_result_no_of_copies COMPUTE STATISTICS;
---- Sprint 6 ---- ---- Sprint 6 ----
create table if not exists indi_pub_hybrid_oa_with_cc stored as parquet as
WITH hybrid_oa AS (
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn
FROM STATS_EXT.plan_s_jn
WHERE issn_print != ""
UNION ALL
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online as issn
FROM STATS_EXT.plan_s_jn
WHERE issn_online != "" and (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)),
issn AS (
SELECT *
FROM (
SELECT id, issn_printed as issn
FROM datasource
WHERE issn_printed IS NOT NULL
UNION ALL
SELECT id,issn_online as issn
FROM datasource
WHERE issn_online IS NOT NULL ) as issn
WHERE LENGTH(issn) > 7)
SELECT DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa
FROM publication_datasources pd
LEFT OUTER JOIN (
SELECT pd.id, 1 as is_hybrid_oa from publication_datasources pd
JOIN datasource d on d.id=pd.datasource
JOIN issn on issn.id=pd.datasource
JOIN hybrid_oa ON issn.issn = hybrid_oa.issn
JOIN indi_result_has_cc_licence cc on pd.id=cc.id
where cc.has_cc_license=1) tmp on pd.id=tmp.id;
ANALYZE TABLE indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS;
create table if not exists indi_pub_downloads stored as parquet as create table if not exists indi_pub_downloads stored as parquet as
SELECT result_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats SELECT result_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats
join publication on result_id=id join publication on result_id=id
@ -335,6 +309,73 @@ FROM
ANALYZE TABLE indi_pub_gold_oa COMPUTE STATISTICS; ANALYZE TABLE indi_pub_gold_oa COMPUTE STATISTICS;
-- indi_pub_hybrid_oa_with_cc: one row per publication_datasources id with a 0/1 flag is_hybrid_oa.
-- The flag is 1 when the publication is hosted by a datasource whose printed/online ISSN
-- (longer than 7 characters) matches an ISSN selected from STATS_EXT.plan_s_jn by the
-- hybrid_oa CTE, the publication has a CC licence (has_cc_license=1) and it is not
-- gold OA (is_gold=0); otherwise the flag is 0.
-- Depends on indi_result_has_cc_licence and indi_pub_gold_oa, so it must run after both are created.
create table if not exists indi_pub_hybrid_oa_with_cc stored as parquet as
WITH hybrid_oa AS (
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn
FROM STATS_EXT.plan_s_jn
WHERE issn_print != ""
UNION ALL
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online as issn
FROM STATS_EXT.plan_s_jn
WHERE issn_online != "" and (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)),
issn AS (
SELECT *
FROM (
SELECT id, issn_printed as issn
FROM datasource
WHERE issn_printed IS NOT NULL
UNION ALL
SELECT id,issn_online as issn
FROM datasource
WHERE issn_online IS NOT NULL ) as issn
WHERE LENGTH(issn) > 7)
SELECT DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa
FROM publication_datasources pd
LEFT OUTER JOIN (
SELECT pd.id, 1 as is_hybrid_oa from publication_datasources pd
JOIN datasource d on d.id=pd.datasource
JOIN issn on issn.id=pd.datasource
JOIN hybrid_oa ON issn.issn = hybrid_oa.issn
JOIN indi_result_has_cc_licence cc on pd.id=cc.id
JOIN indi_pub_gold_oa ga on pd.id=ga.id
where cc.has_cc_license=1 and ga.is_gold=0) tmp on pd.id=tmp.id;
ANALYZE TABLE indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS;
-- indi_pub_bronze_oa: one row per publication_datasources id with a 0/1 flag is_bronze_oa.
-- The flag is 1 when the publication is hosted by a datasource whose printed/online ISSN
-- (longer than 7 characters) matches an ISSN selected from STATS_EXT.plan_s_jn by the
-- hybrid_oa CTE, the publication has no CC licence (has_cc_license=0), is not gold OA
-- (is_gold=0) and is not hybrid OA (is_hybrid_oa=0); otherwise the flag is 0.
-- Depends on indi_result_has_cc_licence, indi_pub_gold_oa and indi_pub_hybrid_oa_with_cc,
-- so it must run after all three are created.
-- The output column is named is_bronze_oa; it was previously mislabelled is_hybrid_oa
-- (copy-paste from the hybrid table definition).
create table if not exists indi_pub_bronze_oa stored as parquet as
WITH hybrid_oa AS (
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn
FROM STATS_EXT.plan_s_jn
WHERE issn_print != ""
UNION ALL
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online as issn
FROM STATS_EXT.plan_s_jn
WHERE issn_online != "" and (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)),
issn AS (
SELECT *
FROM (
SELECT id, issn_printed as issn
FROM datasource
WHERE issn_printed IS NOT NULL
UNION ALL
SELECT id,issn_online as issn
FROM datasource
WHERE issn_online IS NOT NULL ) as issn
WHERE LENGTH(issn) > 7)
SELECT DISTINCT pd.id, coalesce(is_bronze_oa, 0) as is_bronze_oa
FROM publication_datasources pd
LEFT OUTER JOIN (
SELECT pd.id, 1 as is_bronze_oa from publication_datasources pd
JOIN datasource d on d.id=pd.datasource
JOIN issn on issn.id=pd.datasource
JOIN hybrid_oa ON issn.issn = hybrid_oa.issn
JOIN indi_result_has_cc_licence cc on pd.id=cc.id
JOIN indi_pub_gold_oa ga on pd.id=ga.id
JOIN indi_pub_hybrid_oa_with_cc hy on hy.id=pd.id
where cc.has_cc_license=0 and ga.is_gold=0 and hy.is_hybrid_oa=0) tmp on pd.id=tmp.id;
ANALYZE TABLE indi_pub_bronze_oa COMPUTE STATISTICS;
create table if not exists indi_pub_hybrid stored as parquet as create table if not exists indi_pub_hybrid stored as parquet as
WITH gold_oa AS ( SELECT WITH gold_oa AS ( SELECT
issn_l, issn_l,
@ -733,3 +774,27 @@ from result p
on p.id= tmp.id; on p.id= tmp.id;
ANALYZE TABLE indi_result_with_pid COMPUTE STATISTICS; ANALYZE TABLE indi_result_with_pid COMPUTE STATISTICS;
-- indi_impact_measures: one row per (result, measure) obtained by exploding result.measures,
-- keeping every measure except 'views' and 'downloads'.
-- Columns: id (result id with its first three characters stripped), impactmetric (measure id),
-- score (first unit value), score_dec (same value cast to decimal(6,3)), class (second unit value).
-- The key column is aliased explicitly as id; without an alias a CTAS assigns it an
-- auto-generated name, unlike the other indicator tables which expose id.
create table if not exists indi_impact_measures as
select distinct substr(id, 4) as id, measures_ids.id impactmetric, measures_ids.unit.value[0] score,
cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] class
from result lateral view explode(measures) measures as measures_ids
where measures_ids.id!='views' and measures_ids.id!='downloads';
ANALYZE TABLE indi_impact_measures COMPUTE STATISTICS;
CREATE TEMPORARY TABLE pub_fos_totals as
select rf.id, count(distinct lvl3) totals from result_fos rf
group by rf.id;
create table if not exists indi_pub_interdisciplinarity as
select distinct p.id, coalesce(indi_pub_is_interdisciplinary, 0)
as indi_pub_is_interdisciplinary
from pub_fos_totals p
left outer join (
select pub_fos_totals.id, 1 as indi_pub_is_interdisciplinary from pub_fos_totals
where totals>10) tmp on p.id=tmp.id;
drop table pub_fos_totals purge;
ANALYZE TABLE indi_pub_interdisciplinarity COMPUTE STATISTICS;

View File

@ -1,5 +1,78 @@
drop database if exists TARGET cascade; --drop database if exists TARGET cascade;
create database if not exists TARGET; --create database if not exists TARGET;
--
--create view if not exists TARGET.category as select * from SOURCE.category;
--create view if not exists TARGET.concept as select * from SOURCE.concept;
--create view if not exists TARGET.context as select * from SOURCE.context;
--create view if not exists TARGET.country as select * from SOURCE.country;
--create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
--create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
--create view if not exists TARGET.funder as select * from SOURCE.funder;
--create view if not exists TARGET.fundref as select * from SOURCE.fundref;
--create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
--create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
--create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
--create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
--create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
--create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
--
--create table TARGET.result stored as parquet as
-- select distinct * from (
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id)
-- union all
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
-- union all
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
-- 'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC"
-- 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
-- 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ??
-- 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
-- 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
-- 'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki
-- 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
-- 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
-- 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
-- 'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
-- -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
-- 'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
-- 'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
-- 'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
-- 'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
-- 'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
-- 'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
-- 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
-- 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
-- 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
-- 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII)
-- 'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
-- 'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- Unviersity of Warsaw
-- 'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
-- 'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
-- 'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
-- 'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
-- 'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
-- 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
-- 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
-- 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
-- 'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
-- 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University
-- 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
-- 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
-- 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
-- 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
-- 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
-- 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University
-- 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech
-- 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town
-- 'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin
-- 'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology
-- 'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba
-- 'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili
-- 'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University
-- 'openorgs____::3cff625a4370d51e08624cc586138b2f' -- IMT Atlantique
-- ) )) foo;
--
--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
create view if not exists TARGET.category as select * from SOURCE.category; create view if not exists TARGET.category as select * from SOURCE.category;
create view if not exists TARGET.concept as select * from SOURCE.concept; create view if not exists TARGET.concept as select * from SOURCE.concept;
@ -16,61 +89,6 @@ create view if not exists TARGET.totalresearchers as select * from SOURCE.totalr
create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft; create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
create view if not exists TARGET.hrrst as select * from SOURCE.hrrst; create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
-- Builds TARGET.result as the distinct set of SOURCE.result rows that satisfy
-- at least one of three conditions:
--   1. the result is linked through SOURCE.result_projects to a row of SOURCE.project
--   2. the result has at least one row in SOURCE.result_concepts
--   3. the result is linked through SOURCE.result_organization to one of the
--      organization ids listed below
-- The three selections are combined with union all, so the outer select distinct
-- removes results that match more than one condition.
-- NOTE(review): TARGET and SOURCE look like placeholders substituted by the
-- calling script before execution - confirm against the caller.
create table TARGET.result stored as parquet as
select distinct * from (
select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id)
union all
select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
union all
-- Every entry of the list except the last one must end with a comma. Two string
-- literals with no comma between them are not two entries (NOTE(review): Hive is
-- expected to concatenate them into one literal, other engines may reject the
-- statement), so the two organizations involved would never be matched.
select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
    'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC"
    'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
    'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ??
    'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
    'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
    'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki
    'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
    'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
    'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
    'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
    -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
    'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
    'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
    'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
    'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
    'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
    'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
    'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
    'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
    'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
    'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII)
    'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
    'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- University of Warsaw
    'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
    'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
    'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
    'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
    'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
    'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
    'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
    'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
    'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
    'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University
    'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
    'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
    'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
    'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
    'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
    'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University
    'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech
    'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town
    'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin
    'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology
    'openorgs____::846cb428d3f52a445f7275561a7beb5d' -- University of Manitoba
) )) foo;
-- Refresh the table statistics of the freshly created table.
ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS; ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS;
@ -140,6 +158,9 @@ ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS;
create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS; ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS;
create table TARGET.result_accessroute stored as parquet as select * from SOURCE.result_accessroute orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.result_accessroute COMPUTE STATISTICS;
create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result); create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result);
create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result); create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result);
create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou;
@ -213,6 +234,8 @@ ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS;
---- Sprint 6 ---- ---- Sprint 6 ----
create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS; ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS;
create table TARGET.indi_pub_bronze_oa stored as parquet as select * from SOURCE.indi_pub_bronze_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_bronze_oa COMPUTE STATISTICS;
create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS; ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS;
create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
@ -241,3 +264,7 @@ create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SO
ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS; ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS;
create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS; ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS;
create table TARGET.indi_impact_measures stored as parquet as select * from SOURCE.indi_impact_measures orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_impact_measures COMPUTE STATISTICS;
create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * from SOURCE.indi_pub_interdisciplinarity orig where exists (select 1 from TARGET.result r where r.id=orig.id);
ANALYZE TABLE TARGET.indi_pub_interdisciplinarity COMPUTE STATISTICS;

View File

@ -0,0 +1,15 @@
-- Recreates the TARGET database from scratch, then fills TARGET.result with the
-- distinct SOURCE.result rows that have at least one concept whose category
-- matches the CONTEXT pattern.
-- NOTE(review): TARGET, SOURCE and CONTEXT look like placeholders substituted
-- by the calling script before execution - confirm against the caller.
DROP DATABASE IF EXISTS TARGET CASCADE;
CREATE DATABASE IF NOT EXISTS TARGET;
CREATE TABLE TARGET.result STORED AS PARQUET AS
SELECT DISTINCT *
FROM (
    SELECT *
    FROM SOURCE.result res
    WHERE EXISTS (
        SELECT 1
        FROM SOURCE.result_concepts res_conc
        JOIN SOURCE.concept cpt ON cpt.id = res_conc.concept
        -- inner joins: a concept only counts when its category row and the
        -- context row of that category both exist
        JOIN SOURCE.category ctg ON ctg.id = cpt.category
        JOIN SOURCE.context ctx ON ctx.id = ctg.context
        WHERE res_conc.id = res.id
          AND cpt.category LIKE CONTEXT
    )
) selected;
-- Refresh the table statistics of the freshly created table.
ANALYZE TABLE TARGET.result COMPUTE STATISTICS;

View File

@ -0,0 +1,15 @@
-- Recreates the TARGET database from scratch, then fills TARGET.result with the
-- distinct SOURCE.result rows that have at least one concept whose category is
-- not among the values of the CONTEXTS list.
-- NOTE(review): TARGET, SOURCE and CONTEXTS look like placeholders substituted
-- by the calling script before execution - confirm against the caller.
DROP DATABASE IF EXISTS TARGET CASCADE;
CREATE DATABASE IF NOT EXISTS TARGET;
CREATE TABLE TARGET.result STORED AS PARQUET AS
SELECT DISTINCT *
FROM (
    SELECT *
    FROM SOURCE.result res
    WHERE EXISTS (
        SELECT 1
        FROM SOURCE.result_concepts res_conc
        JOIN SOURCE.concept cpt ON cpt.id = res_conc.concept
        -- inner joins: a concept only counts when its category row and the
        -- context row of that category both exist
        JOIN SOURCE.category ctg ON ctg.id = cpt.category
        JOIN SOURCE.context ctx ON ctx.id = ctg.context
        WHERE res_conc.id = res.id
          AND cpt.category NOT IN (CONTEXTS)
    )
) selected;
-- Refresh the table statistics of the freshly created table.
ANALYZE TABLE TARGET.result COMPUTE STATISTICS;

View File

@ -0,0 +1,9 @@
-- Recreates the TARGET database from scratch, then fills TARGET.result with the
-- distinct SOURCE.result rows that are linked, through SOURCE.result_projects,
-- to at least one row of SOURCE.project.
-- NOTE(review): TARGET and SOURCE look like placeholders substituted by the
-- calling script before execution - confirm against the caller.
DROP DATABASE IF EXISTS TARGET CASCADE;
CREATE DATABASE IF NOT EXISTS TARGET;
CREATE TABLE TARGET.result STORED AS PARQUET AS
SELECT DISTINCT *
FROM (
    SELECT *
    FROM SOURCE.result res
    WHERE EXISTS (
        SELECT 1
        FROM SOURCE.result_projects res_proj
        JOIN SOURCE.project proj ON res_proj.project = proj.id
        WHERE res_proj.id = res.id
    )
) funded;
-- Refresh the table statistics of the freshly created table.
ANALYZE TABLE TARGET.result COMPUTE STATISTICS;

View File

@ -0,0 +1,56 @@
-- Recreates the TARGET database from scratch, then fills TARGET.result with the
-- distinct SOURCE.result rows that are linked, through SOURCE.result_organization,
-- to at least one of the organization ids listed below.
-- NOTE(review): TARGET and SOURCE look like placeholders substituted by the
-- calling script before execution - confirm against the caller.
drop database if exists TARGET cascade;
create database if not exists TARGET;
-- Every entry of the list except the last one must end with a comma. Two string
-- literals with no comma between them are not two entries (NOTE(review): Hive is
-- expected to concatenate them into one literal, other engines may reject the
-- statement), so the two organizations involved would never be matched.
create table TARGET.result stored as parquet as
select distinct * from (
select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
    'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC"
    'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
    'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ??
    'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
    'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
    'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki
    'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
    'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
    'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
    'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
    -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
    'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
    'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
    'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
    'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
    'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
    'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
    'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
    'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
    'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
    'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII)
    'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
    'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- University of Warsaw
    'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
    'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
    'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
    'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
    'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
    'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
    'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
    'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
    'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
    'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University
    'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
    'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
    'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
    'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
    'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
    'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University
    'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech
    'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town
    'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin
    'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology
    'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba
    'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili
    'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University
    'openorgs____::3cff625a4370d51e08624cc586138b2f' -- IMT Atlantique
))) foo;
-- Refresh the table statistics of the freshly created table.
ANALYZE TABLE TARGET.result COMPUTE STATISTICS;

View File

@ -374,25 +374,29 @@
<argument>${monitor_db_name}</argument> <argument>${monitor_db_name}</argument>
<argument>${monitor_db_shadow_name}</argument> <argument>${monitor_db_shadow_name}</argument>
<argument>${wf:appPath()}/scripts/step20-createMonitorDB.sql</argument> <argument>${wf:appPath()}/scripts/step20-createMonitorDB.sql</argument>
<argument>${wf:appPath()}/scripts/step20-createMonitorDB_funded.sql</argument>
<argument>${wf:appPath()}/scripts/step20-createMonitorDB_institutions.sql</argument>
<argument>${wf:appPath()}/scripts/step20-createMonitorDB_RIs.sql</argument>
<argument>${wf:appPath()}/scripts/step20-createMonitorDB_RIs_tail.sql</argument>
<file>monitor.sh</file> <file>monitor.sh</file>
</shell> </shell>
<ok to="step20-createMonitorDB-post"/>
<error to="Kill"/>
</action>
<action name="step20-createMonitorDB-post">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>monitor-post.sh</exec>
<argument>${monitor_db_name}</argument>
<argument>${monitor_db_shadow_name}</argument>
<file>monitor-post.sh</file>
</shell>
<ok to="step21-createObservatoryDB-pre"/> <ok to="step21-createObservatoryDB-pre"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!-- <action name="step20-createMonitorDB-post">-->
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
<!-- <job-tracker>${jobTracker}</job-tracker>-->
<!-- <name-node>${nameNode}</name-node>-->
<!-- <exec>monitor-post.sh</exec>-->
<!-- <argument>${monitor_db_name}</argument>-->
<!-- <argument>${monitor_db_shadow_name}</argument>-->
<!-- <file>monitor-post.sh</file>-->
<!-- </shell>-->
<!-- <ok to="step21-createObservatoryDB-pre"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<action name="step21-createObservatoryDB-pre"> <action name="step21-createObservatoryDB-pre">
<shell xmlns="uri:oozie:shell-action:0.1"> <shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>