forked from D-Net/dnet-hadoop
Merge branch 'beta' into fulltext_url_validation
This commit is contained in:
commit
4b00a76271
|
@ -642,12 +642,12 @@
|
|||
"PANGAEA.REPOSITORY": {
|
||||
"openaire_id": "re3data_____::r3d100010134",
|
||||
"datacite_name": "PANGAEA",
|
||||
"official_name": "PANGAEA"
|
||||
"official_name": "PANGAEA - Data Publisher for Earth and Environmental Science"
|
||||
},
|
||||
"TIB.PANGAEA": {
|
||||
"openaire_id": "re3data_____::r3d100010134",
|
||||
"datacite_name": "PANGAEA",
|
||||
"official_name": "PANGAEA"
|
||||
"official_name": "PANGAEA - Data Publisher for Earth and Environmental Science"
|
||||
},
|
||||
"NASAPDS.NASAPDS": {
|
||||
"openaire_id": "re3data_____::r3d100010121",
|
||||
|
|
|
@ -15,6 +15,7 @@ public class Community implements Serializable {
|
|||
private List<Provider> providers = new ArrayList<>();
|
||||
private List<ZenodoCommunity> zenodoCommunities = new ArrayList<>();
|
||||
private SelectionConstraints constraints = new SelectionConstraints();
|
||||
private SelectionConstraints removeConstraints = new SelectionConstraints();
|
||||
|
||||
public String toJson() {
|
||||
final Gson g = new Gson();
|
||||
|
@ -67,4 +68,12 @@ public class Community implements Serializable {
|
|||
public void setConstraints(SelectionConstraints constraints) {
|
||||
this.constraints = constraints;
|
||||
}
|
||||
|
||||
public SelectionConstraints getRemoveConstraints() {
|
||||
return removeConstraints;
|
||||
}
|
||||
|
||||
public void setRemoveConstraints(SelectionConstraints removeConstraints) {
|
||||
this.removeConstraints = removeConstraints;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -28,6 +28,8 @@ public class CommunityConfiguration implements Serializable {
|
|||
private Map<String, SelectionConstraints> selectionConstraintsMap = new HashMap<>();
|
||||
// map eosc datasource -> communityid
|
||||
private Map<String, List<Pair<String, SelectionConstraints>>> eoscDatasourceMap = new HashMap<>();
|
||||
// map communityid -> remove constraints
|
||||
private Map<String, SelectionConstraints> removeConstraintsMap = new HashMap<>();
|
||||
|
||||
public Map<String, List<Pair<String, SelectionConstraints>>> getEoscDatasourceMap() {
|
||||
return eoscDatasourceMap;
|
||||
|
@ -71,6 +73,14 @@ public class CommunityConfiguration implements Serializable {
|
|||
this.selectionConstraintsMap = selectionConstraintsMap;
|
||||
}
|
||||
|
||||
public Map<String, SelectionConstraints> getRemoveConstraintsMap() {
|
||||
return removeConstraintsMap;
|
||||
}
|
||||
|
||||
public void setRemoveConstraintsMap(Map<String, SelectionConstraints> removeConstraintsMap) {
|
||||
this.removeConstraintsMap = removeConstraintsMap;
|
||||
}
|
||||
|
||||
CommunityConfiguration(final Map<String, Community> communities) {
|
||||
this.communities = communities;
|
||||
init();
|
||||
|
@ -90,6 +100,9 @@ public class CommunityConfiguration implements Serializable {
|
|||
if (selectionConstraintsMap == null) {
|
||||
selectionConstraintsMap = Maps.newHashMap();
|
||||
}
|
||||
if (removeConstraintsMap == null) {
|
||||
removeConstraintsMap = Maps.newHashMap();
|
||||
}
|
||||
|
||||
for (Community c : getCommunities().values()) {
|
||||
// get subjects
|
||||
|
@ -111,6 +124,8 @@ public class CommunityConfiguration implements Serializable {
|
|||
zenodocommunityMap);
|
||||
}
|
||||
selectionConstraintsMap.put(id, c.getConstraints());
|
||||
|
||||
removeConstraintsMap.put(id, c.getRemoveConstraints());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -86,6 +86,7 @@ public class CommunityConfigurationFactory {
|
|||
c.setProviders(parseDatasources(node));
|
||||
c.setZenodoCommunities(parseZenodoCommunities(node));
|
||||
c.setConstraints(parseConstrains(node));
|
||||
c.setRemoveConstraints(parseRemoveConstrains(node));
|
||||
return c;
|
||||
}
|
||||
|
||||
|
@ -102,6 +103,19 @@ public class CommunityConfigurationFactory {
|
|||
return selectionConstraints;
|
||||
}
|
||||
|
||||
private static SelectionConstraints parseRemoveConstrains(Node node) {
|
||||
Node constsNode = node.selectSingleNode("./removeConstraints");
|
||||
if (constsNode == null || StringUtils.isBlank(StringUtils.trim(constsNode.getText()))) {
|
||||
return new SelectionConstraints();
|
||||
}
|
||||
SelectionConstraints selectionConstraints = new Gson()
|
||||
.fromJson(constsNode.getText(), SelectionConstraints.class);
|
||||
|
||||
selectionConstraints.setSelection(resolver);
|
||||
log.info("number of selection constraints set " + selectionConstraints.getCriteria().size());
|
||||
return selectionConstraints;
|
||||
}
|
||||
|
||||
private static List<String> parseSubjects(final Node node) {
|
||||
|
||||
final List<String> subjects = Lists.newArrayList();
|
||||
|
|
|
@ -79,6 +79,23 @@ public class ResultTagger implements Serializable {
|
|||
break;
|
||||
}
|
||||
|
||||
// communities contains all the communities to be not added to the context
|
||||
final Set<String> removeCommunities = new HashSet<>();
|
||||
|
||||
conf
|
||||
.getRemoveConstraintsMap()
|
||||
.keySet()
|
||||
.forEach(communityId -> {
|
||||
if (conf.getRemoveConstraintsMap().get(communityId).getCriteria() != null &&
|
||||
conf
|
||||
.getRemoveConstraintsMap()
|
||||
.get(communityId)
|
||||
.getCriteria()
|
||||
.stream()
|
||||
.anyMatch(crit -> crit.verifyCriteria(param)))
|
||||
removeCommunities.add(communityId);
|
||||
});
|
||||
|
||||
// communities contains all the communities to be added as context for the result
|
||||
final Set<String> communities = new HashSet<>();
|
||||
|
||||
|
@ -164,7 +181,8 @@ public class ResultTagger implements Serializable {
|
|||
.getSelectionConstraintsMap()
|
||||
.keySet()
|
||||
.forEach(communityId -> {
|
||||
if (conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null &&
|
||||
if (!removeCommunities.contains(communityId) &&
|
||||
conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null &&
|
||||
conf
|
||||
.getSelectionConstraintsMap()
|
||||
.get(communityId)
|
||||
|
@ -175,6 +193,9 @@ public class ResultTagger implements Serializable {
|
|||
});
|
||||
|
||||
communities.addAll(aconstraints);
|
||||
|
||||
communities.removeAll(removeCommunities);
|
||||
|
||||
if (aconstraints.size() > 0)
|
||||
log.info("Found {} for advancedConstraints ", aconstraints.size());
|
||||
|
||||
|
|
|
@ -10,6 +10,9 @@ where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] and $x//con
|
|||
return
|
||||
<community>
|
||||
{ $x//CONFIGURATION/context/@id}
|
||||
<removeConstraints>
|
||||
{$x//CONFIGURATION/context/param[./@name='removeConstraints']/text() }
|
||||
</removeConstraints>
|
||||
<advancedConstraints>
|
||||
{$x//CONFIGURATION/context/param[./@name='advancedConstraints']/text() }
|
||||
</advancedConstraints>
|
||||
|
|
|
@ -39,8 +39,10 @@ public class BulkTagJobTest {
|
|||
+ " \"contributor\" : \"$['contributor'][*]['value']\","
|
||||
+ " \"description\" : \"$['description'][*]['value']\", "
|
||||
+ " \"subject\" :\"$['subject'][*]['value']\" , " +
|
||||
"\"fos\" : \"$['subject'][?(@['qualifier']['classid']=='FOS')].value\"" +
|
||||
"} ";
|
||||
"\"fos\" : \"$['subject'][?(@['qualifier']['classid']=='FOS')].value\"," +
|
||||
"\"sdg\" : \"$['subject'][?(@['qualifier']['classid']=='SDG')].value\"," +
|
||||
"\"hostedby\" : \"$['instance'][*]['hostedby']['key']\" , " +
|
||||
"\"collectedfrom\" : \"$['instance'][*]['collectedfrom']['key']\"} ";
|
||||
|
||||
private static SparkSession spark;
|
||||
|
||||
|
@ -56,7 +58,7 @@ public class BulkTagJobTest {
|
|||
.toString(
|
||||
BulkTagJobTest.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf_dth.xml"));
|
||||
"/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf_remove.xml"));
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
@ -1525,4 +1527,45 @@ public class BulkTagJobTest {
|
|||
.count());
|
||||
}
|
||||
|
||||
@Test
|
||||
void removeTest() throws Exception {
|
||||
final String pathMap = BulkTagJobTest.pathMap;
|
||||
SparkBulkTagJob
|
||||
.main(
|
||||
new String[] {
|
||||
"-isTest", Boolean.TRUE.toString(),
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath",
|
||||
getClass()
|
||||
.getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/update_datasourcewithconstraints")
|
||||
.getPath(),
|
||||
"-taggingConf", taggingConf,
|
||||
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
|
||||
"-outputPath", workingDir.toString() + "/dataset",
|
||||
"-isLookUpUrl", MOCK_IS_LOOK_UP_URL,
|
||||
"-pathMap", pathMap
|
||||
});
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Dataset> tmp = sc
|
||||
.textFile(workingDir.toString() + "/dataset")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
|
||||
|
||||
Assertions.assertEquals(12, tmp.count());
|
||||
org.apache.spark.sql.Dataset<Dataset> verificationDataset = spark
|
||||
.createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
|
||||
|
||||
verificationDataset.createOrReplaceTempView("dataset");
|
||||
String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
|
||||
+ "from dataset "
|
||||
+ "lateral view explode(context) c as MyT "
|
||||
+ "lateral view explode(MyT.datainfo) d as MyD "
|
||||
+ "where MyD.inferenceprovenance = 'bulktagging'";
|
||||
|
||||
org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query);
|
||||
|
||||
Assertions.assertEquals(3, idExplodeCommunity.filter("community = 'dth'").count());
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
|
@ -21,7 +21,7 @@
|
|||
</property>
|
||||
<property>
|
||||
<name>hive_jdbc_url</name>
|
||||
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=19166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=11596411699;spark.yarn.driver.memoryOverhead=1228</value>
|
||||
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=22166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=15596411699;spark.yarn.driver.memoryOverhead=1228</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.wf.workflow.notification.url</name>
|
||||
|
|
|
@ -68,6 +68,16 @@ copydb $USAGE_STATS_DB
|
|||
copydb $PROD_USAGE_STATS_DB
|
||||
copydb $EXT_DB
|
||||
copydb $STATS_DB
|
||||
copydb $MONITOR_DB
|
||||
#copydb $MONITOR_DB
|
||||
copydb $OBSERVATORY_DB
|
||||
|
||||
copydb $MONITOR_DB'_funded'
|
||||
copydb $MONITOR_DB'_institutions'
|
||||
copydb $MONITOR_DB'_RIs_tail'
|
||||
|
||||
# Copy the per-context monitor databases. The database suffix is the context id
# with every "-" and every "::" replaced by "_" (e.g. dh-ch::other -> dh_ch_other).
contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
for ctx in ${contexts}
do
  suffix=$(echo "$ctx" | sed -e 's/-/_/g' -e 's/::/_/g')
  copydb ${MONITOR_DB}_${suffix}
done
|
|
@ -29,3 +29,14 @@ createShadowDB $STATS_DB $STATS_DB_SHADOW
|
|||
# Invoke createShadowDB for each database together with its shadow counterpart.
createShadowDB $MONITOR_DB $MONITOR_DB_SHADOW
createShadowDB $OBSERVATORY_DB $OBSERVATORY_DB_SHADOW
# Fixed: the two arguments were the literal words USAGE_STATS_DB / USAGE_STATS_DB_SHADOW
# (missing "$"), unlike every sibling call.
# NOTE(review): assumes both variables are set earlier in this script - confirm.
createShadowDB $USAGE_STATS_DB $USAGE_STATS_DB_SHADOW

createShadowDB ${MONITOR_DB}_funded ${MONITOR_DB}_funded_shadow
createShadowDB ${MONITOR_DB}_institutions ${MONITOR_DB}_institutions_shadow
createShadowDB ${MONITOR_DB}_RIs_tail ${MONITOR_DB}_RIs_tail_shadow

# Per-context monitor databases. The database suffix is the context id with every
# "-" and every "::" replaced by "_" (e.g. dh-ch::other -> dh_ch_other).
contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
for ctx in ${contexts}
do
  suffix=$(echo "$ctx" | sed -e 's/-/_/g' -e 's/::/_/g')
  createShadowDB ${MONITOR_DB}_${suffix} ${MONITOR_DB}_${suffix}_shadow
done
|
|
@ -10,16 +10,88 @@ export SOURCE=$1
|
|||
export TARGET=$2
|
||||
export SHADOW=$3
|
||||
export SCRIPT_PATH=$4
|
||||
export SCRIPT_PATH2=$5
|
||||
export SCRIPT_PATH3=$6
|
||||
export SCRIPT_PATH4=$7
|
||||
export SCRIPT_PATH5=$8
|
||||
|
||||
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
|
||||
export HADOOP_USER_NAME="oozie"
|
||||
|
||||
echo "Getting file from " $SCRIPT_PATH
|
||||
hdfs dfs -copyToLocal $SCRIPT_PATH
|
||||
echo "Getting file from " $4
|
||||
hdfs dfs -copyToLocal $4
|
||||
|
||||
echo "Getting file from " $5
|
||||
hdfs dfs -copyToLocal $5
|
||||
|
||||
echo "Getting file from " $6
|
||||
hdfs dfs -copyToLocal $6
|
||||
|
||||
echo "Getting file from " $7
|
||||
hdfs dfs -copyToLocal $7
|
||||
|
||||
echo "Getting file from " $8
|
||||
hdfs dfs -copyToLocal $8
|
||||
|
||||
echo "Creating monitor database"
|
||||
#cat step20-createMonitorDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 > foo
|
||||
cat step20-createMonitorDB.sql | sed "s/TARGET/${TARGET}/g" | sed "s/SOURCE/${SOURCE}/g1" > foo
|
||||
cat step20-createMonitorDB_funded.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_funded/g1" > foo
|
||||
hive $HIVE_OPTS -f foo
|
||||
cat step20-createMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_funded/g1" > foo
|
||||
hive $HIVE_OPTS -f foo
|
||||
#
|
||||
cat step20-createMonitorDB_institutions.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_institutions/g1" > foo
|
||||
hive $HIVE_OPTS -f foo
|
||||
cat step20-createMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_institutions/g1" > foo
|
||||
hive $HIVE_OPTS -f foo
|
||||
|
||||
contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
|
||||
|
||||
for i in ${contexts}
|
||||
do
|
||||
tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
|
||||
tmp2=`echo "$i" |sed 's/:.*//' `
|
||||
cat step20-createMonitorDB_RIs.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_$tmp/g1" | sed "s/CONTEXT/\'%$tmp2%\'/g" > foo
|
||||
hive $HIVE_OPTS -f foo
|
||||
cat step20-createMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_$tmp/g1" > foo
|
||||
hive $HIVE_OPTS -f foo
|
||||
done
|
||||
|
||||
|
||||
cat step20-createMonitorDB_RIs_tail.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_RIs_tail/g1" | sed "s/CONTEXTS/\"'knowmad::other','dh-ch::other', 'enermaps::other', 'gotriple::other', 'neanias-atmospheric::other', 'rural-digital-europe::other', 'covid-19::other', 'aurora::other', 'neanias-space::other', 'north-america-studies::other', 'north-american-studies::other', 'eutopia::other'\"/g" > foo
|
||||
hive $HIVE_OPTS -f foo
|
||||
cat step20-createMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_RIs_tail/g1" > foo
|
||||
hive $HIVE_OPTS -f foo
|
||||
|
||||
echo "Hive shell finished"
|
||||
|
||||
echo "Updating shadow monitor funded database"
|
||||
hive -e "drop database if exists ${SHADOW}_funded cascade"
|
||||
hive -e "create database if not exists ${SHADOW}_funded"
|
||||
hive $HIVE_OPTS --database ${2}_funded -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_funded.\1 as select * from ${2}_funded.\1;/" > foo
|
||||
hive -f foo
|
||||
echo "Updated shadow monitor funded database"
|
||||
|
||||
echo "Updating shadow monitor insitutions database"
|
||||
hive -e "drop database if exists ${SHADOW}_institutions cascade"
|
||||
hive -e "create database if not exists ${SHADOW}_institutions"
|
||||
hive $HIVE_OPTS --database ${2}_institutions -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_institutions.\1 as select * from ${2}_institutions.\1;/" > foo
|
||||
hive -f foo
|
||||
echo "Shadow db monitor insitutions ready!"
|
||||
|
||||
echo "Updating shadow monitor RIs database"
|
||||
for i in $contexts
|
||||
do
|
||||
tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
|
||||
hive -e "drop database if exists ${SHADOW}_${tmp} cascade"
|
||||
hive -e "create database if not exists ${SHADOW}_${tmp}"
|
||||
hive $HIVE_OPTS --database ${2}_${tmp} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_${tmp}.\1 as select * from ${2}_${tmp}.\1;/" > foo
|
||||
hive -f foo
|
||||
done
|
||||
echo "Shadow db monitor RIs ready!"
|
||||
|
||||
echo "Updating shadow monitor RIs tail database"
|
||||
hive -e "drop database if exists ${SHADOW}_ris_tail cascade"
|
||||
hive -e "create database if not exists ${SHADOW}_ris_tail"
|
||||
hive $HIVE_OPTS --database ${2}_ris_tail -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_ris_tail.\1 as select * from ${2}_ris_tail.\1;/" > foo
|
||||
hive -f foo
|
||||
echo "Shadow db monitor RIs tail ready!"
|
||||
|
|
|
@ -46,4 +46,8 @@ FROM (
|
|||
LEFT OUTER JOIN (
|
||||
SELECT substr(d.id, 4) id
|
||||
from ${openaire_db_name}.datasource d
|
||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id;
|
||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as
|
||||
select distinct substr(id,4),id, accessroute from ${openaire_db_name}.result
|
||||
lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute;
|
||||
|
|
|
@ -92,53 +92,59 @@ ANALYZE TABLE indi_funded_result_with_fundref COMPUTE STATISTICS;
|
|||
--
|
||||
-- compute stats indi_result_org_collab;
|
||||
--
|
||||
create TEMPORARY TABLE tmp AS SELECT ro.organization organization, ro.id from result_organization ro
|
||||
create TEMPORARY TABLE tmp AS SELECT ro.organization organization, ro.id, o.name from result_organization ro
|
||||
join organization o on o.id=ro.organization where o.name is not null;
|
||||
|
||||
create table if not exists indi_result_org_collab stored as parquet as
|
||||
select o1.organization org1, o2.organization org2, count(o1.id) as collaborations
|
||||
select o1.organization org1, o1.name org1name1, o2.organization org2, o2.name org2name2, count(o1.id) as collaborations
|
||||
from tmp as o1
|
||||
join tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization
|
||||
group by o1.organization, o2.organization;
|
||||
join tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization and o1.name!=o2.name
|
||||
group by o1.organization, o2.organization, o1.name, o2.name;
|
||||
|
||||
drop table tmp purge;
|
||||
|
||||
ANALYZE TABLE indi_result_org_collab COMPUTE STATISTICS;
|
||||
|
||||
create TEMPORARY TABLE tmp AS
|
||||
select distinct ro.organization organization, ro.id, o.country from result_organization ro
|
||||
select distinct ro.organization organization, ro.id, o.name, o.country from result_organization ro
|
||||
join organization o on o.id=ro.organization where country <> 'UNKNOWN' and o.name is not null;
|
||||
|
||||
create table if not exists indi_result_org_country_collab stored as parquet as
|
||||
select o1.organization org1,o2.country country2, count(o1.id) as collaborations
|
||||
select o1.organization org1,o1.name org1name1, o2.country country2, count(o1.id) as collaborations
|
||||
from tmp as o1 join tmp as o2 on o1.id=o2.id
|
||||
where o1.id=o2.id and o1.country!=o2.country
|
||||
group by o1.organization, o1.id, o2.country;
|
||||
group by o1.organization, o1.id, o1.name, o2.country;
|
||||
|
||||
drop table tmp purge;
|
||||
|
||||
ANALYZE TABLE indi_result_org_country_collab COMPUTE STATISTICS;
|
||||
|
||||
create TEMPORARY TABLE tmp AS
|
||||
select o.id organization, o.name, ro.project as project from organization o
|
||||
join organization_projects ro on o.id=ro.id where o.name is not null;
|
||||
|
||||
create table if not exists indi_project_collab_org stored as parquet as
|
||||
select o1.id org1,o2.id org2, count(distinct o1.project) as collaborations
|
||||
from organization_projects as o1
|
||||
join organization_projects as o2 on o1.project=o2.project
|
||||
where o1.id!=o2.id
|
||||
group by o1.id, o2.id;
|
||||
select o1.organization org1,o1.name orgname1, o2.organization org2, o2.name orgname2, count(distinct o1.project) as collaborations
|
||||
from tmp as o1
|
||||
join tmp as o2 on o1.project=o2.project
|
||||
where o1.organization<>o2.organization and o1.name<>o2.name
|
||||
group by o1.name,o2.name, o1.organization, o2.organization;
|
||||
|
||||
drop table tmp purge;
|
||||
|
||||
ANALYZE TABLE indi_project_collab_org COMPUTE STATISTICS;
|
||||
|
||||
create TEMPORARY TABLE tmp AS
|
||||
select o.id organization, o.country , ro.project as project from organization o
|
||||
select o.id organization, o.name, o.country , ro.project as project from organization o
|
||||
join organization_projects ro on o.id=ro.id
|
||||
and o.country <> 'UNKNOWN';
|
||||
and o.country <> 'UNKNOWN' and o.name is not null;
|
||||
|
||||
create table if not exists indi_project_collab_org_country stored as parquet as
|
||||
select o1.organization org1,o2.country country2, count(distinct o1.project) as collaborations
|
||||
select o1.organization org1,o1.name org1name, o2.country country2, count(distinct o1.project) as collaborations
|
||||
from tmp as o1
|
||||
join tmp as o2 on o1.project=o2.project
|
||||
where o1.organization<>o2.organization and o1.country<>o2.country
|
||||
group by o1.organization, o2.country;
|
||||
group by o1.organization, o2.country, o1.name;
|
||||
|
||||
drop table tmp purge;
|
||||
|
||||
|
@ -217,38 +223,6 @@ select id, count(id) as number_of_copies from result_instance group by id;
|
|||
ANALYZE TABLE indi_result_no_of_copies COMPUTE STATISTICS;
|
||||
|
||||
---- Sprint 6 ----
|
||||
create table if not exists indi_pub_hybrid_oa_with_cc stored as parquet as
|
||||
WITH hybrid_oa AS (
|
||||
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn
|
||||
FROM STATS_EXT.plan_s_jn
|
||||
WHERE issn_print != ""
|
||||
UNION ALL
|
||||
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online as issn
|
||||
FROM STATS_EXT.plan_s_jn
|
||||
WHERE issn_online != "" and (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)),
|
||||
issn AS (
|
||||
SELECT *
|
||||
FROM (
|
||||
SELECT id, issn_printed as issn
|
||||
FROM datasource
|
||||
WHERE issn_printed IS NOT NULL
|
||||
UNION ALL
|
||||
SELECT id,issn_online as issn
|
||||
FROM datasource
|
||||
WHERE issn_online IS NOT NULL ) as issn
|
||||
WHERE LENGTH(issn) > 7)
|
||||
SELECT DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa
|
||||
FROM publication_datasources pd
|
||||
LEFT OUTER JOIN (
|
||||
SELECT pd.id, 1 as is_hybrid_oa from publication_datasources pd
|
||||
JOIN datasource d on d.id=pd.datasource
|
||||
JOIN issn on issn.id=pd.datasource
|
||||
JOIN hybrid_oa ON issn.issn = hybrid_oa.issn
|
||||
JOIN indi_result_has_cc_licence cc on pd.id=cc.id
|
||||
where cc.has_cc_license=1) tmp on pd.id=tmp.id;
|
||||
|
||||
ANALYZE TABLE indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS;
|
||||
|
||||
create table if not exists indi_pub_downloads stored as parquet as
|
||||
SELECT result_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats
|
||||
join publication on result_id=id
|
||||
|
@ -335,6 +309,73 @@ FROM
|
|||
|
||||
ANALYZE TABLE indi_pub_gold_oa COMPUTE STATISTICS;
|
||||
|
||||
create table if not exists indi_pub_hybrid_oa_with_cc stored as parquet as
|
||||
WITH hybrid_oa AS (
|
||||
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn
|
||||
FROM STATS_EXT.plan_s_jn
|
||||
WHERE issn_print != ""
|
||||
UNION ALL
|
||||
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online as issn
|
||||
FROM STATS_EXT.plan_s_jn
|
||||
WHERE issn_online != "" and (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)),
|
||||
issn AS (
|
||||
SELECT *
|
||||
FROM (
|
||||
SELECT id, issn_printed as issn
|
||||
FROM datasource
|
||||
WHERE issn_printed IS NOT NULL
|
||||
UNION ALL
|
||||
SELECT id,issn_online as issn
|
||||
FROM datasource
|
||||
WHERE issn_online IS NOT NULL ) as issn
|
||||
WHERE LENGTH(issn) > 7)
|
||||
SELECT DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa
|
||||
FROM publication_datasources pd
|
||||
LEFT OUTER JOIN (
|
||||
SELECT pd.id, 1 as is_hybrid_oa from publication_datasources pd
|
||||
JOIN datasource d on d.id=pd.datasource
|
||||
JOIN issn on issn.id=pd.datasource
|
||||
JOIN hybrid_oa ON issn.issn = hybrid_oa.issn
|
||||
JOIN indi_result_has_cc_licence cc on pd.id=cc.id
|
||||
JOIN indi_pub_gold_oa ga on pd.id=ga.id
|
||||
where cc.has_cc_license=1 and ga.is_gold=0) tmp on pd.id=tmp.id;
|
||||
|
||||
ANALYZE TABLE indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS;
|
||||
|
||||
-- indi_pub_bronze_oa: one row per publication id in publication_datasources with a 0/1 flag.
-- The flag is 1 when the publication's datasource ISSN matches a STATS_EXT.plan_s_jn entry
-- selected by hybrid_oa and the publication has no CC licence, is not gold and is not hybrid OA.
-- NOTE(review): the flag column is exposed as is_hybrid_oa although it carries the bronze flag -
-- this looks like a copy/paste from indi_pub_hybrid_oa_with_cc. Confirm with the consumers of
-- this table before renaming it to is_bronze_oa.
CREATE TABLE IF NOT EXISTS indi_pub_bronze_oa STORED AS PARQUET AS
WITH hybrid_oa AS (
    SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print AS issn
    FROM STATS_EXT.plan_s_jn
    WHERE issn_print != ""
    UNION ALL
    SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online AS issn
    FROM STATS_EXT.plan_s_jn
    WHERE issn_online != "" AND (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)),
issn AS (
    SELECT *
    FROM (
        SELECT id, issn_printed AS issn
        FROM datasource
        WHERE issn_printed IS NOT NULL
        UNION ALL
        SELECT id, issn_online AS issn
        FROM datasource
        WHERE issn_online IS NOT NULL) AS issn
    WHERE LENGTH(issn) > 7)
SELECT DISTINCT pd.id, COALESCE(is_bronze_oa, 0) AS is_hybrid_oa
FROM publication_datasources pd
LEFT OUTER JOIN (
    SELECT pd.id, 1 AS is_bronze_oa
    FROM publication_datasources pd
    JOIN datasource d ON d.id = pd.datasource
    JOIN issn ON issn.id = pd.datasource
    JOIN hybrid_oa ON issn.issn = hybrid_oa.issn
    JOIN indi_result_has_cc_licence cc ON pd.id = cc.id
    JOIN indi_pub_gold_oa ga ON pd.id = ga.id
    JOIN indi_pub_hybrid_oa_with_cc hy ON hy.id = pd.id
    WHERE cc.has_cc_license = 0 AND ga.is_gold = 0 AND hy.is_hybrid_oa = 0) tmp ON pd.id = tmp.id;
|
||||
|
||||
ANALYZE TABLE indi_pub_bronze_oa COMPUTE STATISTICS;
|
||||
|
||||
create table if not exists indi_pub_hybrid stored as parquet as
|
||||
WITH gold_oa AS ( SELECT
|
||||
issn_l,
|
||||
|
@ -733,3 +774,27 @@ from result p
|
|||
on p.id= tmp.id;
|
||||
|
||||
ANALYZE TABLE indi_result_with_pid COMPUTE STATISTICS;
|
||||
|
||||
create table if not exists indi_impact_measures as
|
||||
select distinct substr(id, 4), measures_ids.id impactmetric, measures_ids.unit.value[0] score,
|
||||
cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] class
|
||||
from result lateral view explode(measures) measures as measures_ids
|
||||
where measures_ids.id!='views' and measures_ids.id!='downloads';
|
||||
|
||||
ANALYZE TABLE indi_impact_measures COMPUTE STATISTICS;
|
||||
|
||||
CREATE TEMPORARY TABLE pub_fos_totals as
|
||||
select rf.id, count(distinct lvl3) totals from result_fos rf
|
||||
group by rf.id;
|
||||
|
||||
create table if not exists indi_pub_interdisciplinarity as
|
||||
select distinct p.id, coalesce(indi_pub_is_interdisciplinary, 0)
|
||||
as indi_pub_is_interdisciplinary
|
||||
from pub_fos_totals p
|
||||
left outer join (
|
||||
select pub_fos_totals.id, 1 as indi_pub_is_interdisciplinary from pub_fos_totals
|
||||
where totals>10) tmp on p.id=tmp.id;
|
||||
|
||||
drop table pub_fos_totals purge;
|
||||
|
||||
ANALYZE TABLE indi_pub_interdisciplinarity COMPUTE STATISTICS;
|
|
@ -1,5 +1,78 @@
|
|||
drop database if exists TARGET cascade;
|
||||
create database if not exists TARGET;
|
||||
--drop database if exists TARGET cascade;
|
||||
--create database if not exists TARGET;
|
||||
--
|
||||
--create view if not exists TARGET.category as select * from SOURCE.category;
|
||||
--create view if not exists TARGET.concept as select * from SOURCE.concept;
|
||||
--create view if not exists TARGET.context as select * from SOURCE.context;
|
||||
--create view if not exists TARGET.country as select * from SOURCE.country;
|
||||
--create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
|
||||
--create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
|
||||
--create view if not exists TARGET.funder as select * from SOURCE.funder;
|
||||
--create view if not exists TARGET.fundref as select * from SOURCE.fundref;
|
||||
--create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
|
||||
--create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
|
||||
--create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
|
||||
--create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
|
||||
--create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
|
||||
--create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
|
||||
--
|
||||
--create table TARGET.result stored as parquet as
|
||||
-- select distinct * from (
|
||||
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id)
|
||||
-- union all
|
||||
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
|
||||
-- union all
|
||||
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
|
||||
-- 'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC"
|
||||
-- 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
|
||||
-- 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ??
|
||||
-- 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
|
||||
-- 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
|
||||
-- 'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki
|
||||
-- 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
|
||||
-- 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
|
||||
-- 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
|
||||
-- 'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
|
||||
-- -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
|
||||
-- 'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
|
||||
-- 'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
|
||||
-- 'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
|
||||
-- 'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
|
||||
-- 'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
|
||||
-- 'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
|
||||
-- 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
|
||||
-- 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
|
||||
-- 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
|
||||
-- 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII)
|
||||
-- 'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
|
||||
-- 'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- Unviersity of Warsaw
|
||||
-- 'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
|
||||
-- 'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
|
||||
-- 'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
|
||||
-- 'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
|
||||
-- 'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
|
||||
-- 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
|
||||
-- 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
|
||||
-- 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
|
||||
-- 'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
|
||||
-- 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University
|
||||
-- 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
|
||||
-- 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
|
||||
-- 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
|
||||
-- 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
|
||||
-- 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
|
||||
-- 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University
|
||||
-- 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech
|
||||
-- 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town
|
||||
-- 'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin
|
||||
-- 'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology
|
||||
-- 'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba
|
||||
-- 'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili
|
||||
-- 'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University
|
||||
-- 'openorgs____::3cff625a4370d51e08624cc586138b2f' -- IMT Atlantique
|
||||
-- ) )) foo;
|
||||
--
|
||||
--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
|
||||
|
||||
create view if not exists TARGET.category as select * from SOURCE.category;
|
||||
create view if not exists TARGET.concept as select * from SOURCE.concept;
|
||||
|
@ -16,61 +89,6 @@ create view if not exists TARGET.totalresearchers as select * from SOURCE.totalr
|
|||
create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
|
||||
create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
|
||||
|
||||
-- Builds the monitor result table as the de-duplicated union of three groups:
-- results linked to a project, results linked to a concept, and results
-- affiliated to one of the organizations enumerated below.
create table TARGET.result stored as parquet as
select distinct * from (
    select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id)
    union all
    select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
    union all
    select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
        'openorgs____::b84450f9864182c67b8611b5593f4250', -- Athena Research and Innovation Center In Information Communication & Knowledge Technologies (ARC)
        'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', -- National Research Council
        'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', -- ??? flagged in the original note as possibly non-existent - verify
        'openorgs____::d169c7407dd417152596908d48c11460', -- Masaryk University
        'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', -- University of Belgrade
        'openorgs____::0ae431b820e4c33db8967fbb2b919150', -- University of Helsinki
        'openorgs____::759d59f05d77188faee99b7493b46805', -- University of Minho
        'openorgs____::cad284878801b9465fa51a95b1d779db', -- Universidad Politécnica de Madrid
        'openorgs____::eadc8da90a546e98c03f896661a2e4d4', -- University of Göttingen
        'openorgs____::c0286313e36479eff8676dba9b724b40', -- National and Kapodistrian University of Athens
        -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', -- Université Paris Diderot
        'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', -- Bielefeld University
        'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', -- University of Southern Denmark
        'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', -- Humboldt-Universität zu Berlin
        'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', -- TU Darmstadt
        'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', -- KU Leuven
        'openorgs____::4c6f119632adf789746f0a057ed73e90', -- University of the Western Cape
        'openorgs____::ec3665affa01aeafa28b7852c4176dbd', -- Rudjer Boskovic Institute
        'openorgs____::5f31346d444a7f06a28c880fb170b0f6', -- Ghent University
        'openorgs____::2dbe47117fd5409f9c61620813456632', -- University of Luxembourg
        'openorgs____::6445d7758d3a40c4d997953b6632a368', -- National Institute of Informatics (NII)
        'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
        'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- University of Warsaw
        'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
        'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
        'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
        'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
        'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
        'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
        'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
        'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
        'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
        'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University
        'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
        'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
        'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
        'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
        'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
        -- the separator after the next id was missing, which fused it with the following literal
        'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University
        'openorgs____::57784c9e047e826fefdb1ef816120d92', -- Arts et Métiers ParisTech
        'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town
        'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin
        'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology
        'openorgs____::846cb428d3f52a445f7275561a7beb5d'  -- University of Manitoba
) )) foo;
|
||||
|
||||
ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS;
|
||||
|
||||
|
@ -140,6 +158,9 @@ ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS;
|
|||
create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_accessroute stored as parquet as select * from SOURCE.result_accessroute orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
ANALYZE TABLE TARGET.result_accessroute COMPUTE STATISTICS;
|
||||
|
||||
create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result);
|
||||
create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result);
|
||||
create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou;
|
||||
|
@ -213,6 +234,8 @@ ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS;
|
|||
---- Sprint 6 ----
|
||||
create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_bronze_oa stored as parquet as select * from SOURCE.indi_pub_bronze_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
ANALYZE TABLE TARGET.indi_pub_bronze_oa COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
|
||||
ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
|
||||
|
@ -241,3 +264,7 @@ create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SO
|
|||
ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS;
|
||||
create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS;
|
||||
create table TARGET.indi_impact_measures stored as parquet as select * from SOURCE.indi_impact_measures orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
ANALYZE TABLE TARGET.indi_impact_measures COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * from SOURCE.indi_pub_interdisciplinarity orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
ANALYZE TABLE TARGET.indi_pub_interdisciplinarity COMPUTE STATISTICS;
|
||||
|
|
|
@ -0,0 +1,15 @@
|
|||
-- Rebuilds the target database from scratch, keeping only the results tied to a
-- concept whose category satisfies the pattern used in the final predicate.
-- NOTE(review): the upper-case database names and the pattern operand look like
-- tokens substituted by the calling script before execution - confirm.
drop database if exists TARGET cascade;
create database if not exists TARGET;

create table TARGET.result stored as parquet as
select distinct *
from (
    select *
    from SOURCE.result res
    where exists (
        select 1
        from SOURCE.result_concepts rcn
            join SOURCE.concept cpt on cpt.id = rcn.concept
            -- the two joins below only require that the category and its context exist
            join SOURCE.category ctg on ctg.id = cpt.category
            join SOURCE.context ctx on ctx.id = ctg.context
        where rcn.id = res.id
          and cpt.category like CONTEXT
    )
) picked;

ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
|
|
@ -0,0 +1,15 @@
|
|||
-- Rebuilds the target database from scratch, keeping the results that have at
-- least one concept whose category is outside the list in the final predicate.
-- NOTE(review): the upper-case database names and the list operand look like
-- tokens substituted by the calling script before execution - confirm.
drop database if exists TARGET cascade;
create database if not exists TARGET;

create table TARGET.result stored as parquet as
select distinct *
from (
    select *
    from SOURCE.result res
    where exists (
        select 1
        from SOURCE.result_concepts rcn
            join SOURCE.concept cpt on cpt.id = rcn.concept
            -- the two joins below only require that the category and its context exist
            join SOURCE.category ctg on ctg.id = cpt.category
            join SOURCE.context ctx on ctx.id = ctg.context
        where rcn.id = res.id
          and cpt.category not in (CONTEXTS)
    )
) picked;

ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
|
|
@ -0,0 +1,9 @@
|
|||
-- Rebuilds the target database from scratch, keeping only the results that are
-- linked to at least one project present in the project table.
drop database if exists TARGET cascade;
create database if not exists TARGET;

create table TARGET.result stored as parquet as
select distinct *
from (
    select *
    from SOURCE.result res
    where exists (
        select 1
        from SOURCE.result_projects rpj
            join SOURCE.project prj on rpj.project = prj.id
        where rpj.id = res.id
    )
) picked;

ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
|
|
@ -0,0 +1,56 @@
|
|||
-- Rebuilds the target database from scratch and fills its result table with the
-- results affiliated to one of the organizations enumerated below.
drop database if exists TARGET cascade;
create database if not exists TARGET;

create table TARGET.result stored as parquet as
select distinct * from (
    select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
        'openorgs____::b84450f9864182c67b8611b5593f4250', -- Athena Research and Innovation Center In Information Communication & Knowledge Technologies (ARC)
        'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', -- National Research Council
        'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', -- ??? flagged in the original note as possibly non-existent - verify
        'openorgs____::d169c7407dd417152596908d48c11460', -- Masaryk University
        'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', -- University of Belgrade
        'openorgs____::0ae431b820e4c33db8967fbb2b919150', -- University of Helsinki
        'openorgs____::759d59f05d77188faee99b7493b46805', -- University of Minho
        'openorgs____::cad284878801b9465fa51a95b1d779db', -- Universidad Politécnica de Madrid
        'openorgs____::eadc8da90a546e98c03f896661a2e4d4', -- University of Göttingen
        'openorgs____::c0286313e36479eff8676dba9b724b40', -- National and Kapodistrian University of Athens
        -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', -- Université Paris Diderot
        'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', -- Bielefeld University
        'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', -- University of Southern Denmark
        'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', -- Humboldt-Universität zu Berlin
        'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', -- TU Darmstadt
        'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', -- KU Leuven
        'openorgs____::4c6f119632adf789746f0a057ed73e90', -- University of the Western Cape
        'openorgs____::ec3665affa01aeafa28b7852c4176dbd', -- Rudjer Boskovic Institute
        'openorgs____::5f31346d444a7f06a28c880fb170b0f6', -- Ghent University
        'openorgs____::2dbe47117fd5409f9c61620813456632', -- University of Luxembourg
        'openorgs____::6445d7758d3a40c4d997953b6632a368', -- National Institute of Informatics (NII)
        'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
        'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- University of Warsaw
        'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
        'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
        'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
        'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
        'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
        'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
        'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
        'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
        'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
        'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University
        'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
        'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
        'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
        'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
        'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
        -- the separator after the next id was missing, which fused it with the following literal
        'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University
        'openorgs____::57784c9e047e826fefdb1ef816120d92', -- Arts et Métiers ParisTech
        'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town
        'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin
        'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology
        'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba
        'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili
        'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University
        'openorgs____::3cff625a4370d51e08624cc586138b2f'  -- IMT Atlantique
))) foo;

ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
|
|
@ -374,25 +374,29 @@
|
|||
<argument>${monitor_db_name}</argument>
|
||||
<argument>${monitor_db_shadow_name}</argument>
|
||||
<argument>${wf:appPath()}/scripts/step20-createMonitorDB.sql</argument>
|
||||
<argument>${wf:appPath()}/scripts/step20-createMonitorDB_funded.sql</argument>
|
||||
<argument>${wf:appPath()}/scripts/step20-createMonitorDB_institutions.sql</argument>
|
||||
<argument>${wf:appPath()}/scripts/step20-createMonitorDB_RIs.sql</argument>
|
||||
<argument>${wf:appPath()}/scripts/step20-createMonitorDB_RIs_tail.sql</argument>
|
||||
<file>monitor.sh</file>
|
||||
</shell>
|
||||
<ok to="step20-createMonitorDB-post"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- Shell action that runs monitor-post.sh with the monitor database name and its shadow name as arguments. On success the workflow continues with step21-createObservatoryDB-pre, on error it goes to Kill. -->
<action name="step20-createMonitorDB-post">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>monitor-post.sh</exec>
|
||||
<argument>${monitor_db_name}</argument>
|
||||
<argument>${monitor_db_shadow_name}</argument>
|
||||
<file>monitor-post.sh</file>
|
||||
</shell>
|
||||
<ok to="step21-createObservatoryDB-pre"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- <action name="step20-createMonitorDB-post">-->
|
||||
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
|
||||
<!-- <job-tracker>${jobTracker}</job-tracker>-->
|
||||
<!-- <name-node>${nameNode}</name-node>-->
|
||||
<!-- <exec>monitor-post.sh</exec>-->
|
||||
<!-- <argument>${monitor_db_name}</argument>-->
|
||||
<!-- <argument>${monitor_db_shadow_name}</argument>-->
|
||||
<!-- <file>monitor-post.sh</file>-->
|
||||
<!-- </shell>-->
|
||||
<!-- <ok to="step21-createObservatoryDB-pre"/>-->
|
||||
<!-- <error to="Kill"/>-->
|
||||
<!-- </action>-->
|
||||
|
||||
<action name="step21-createObservatoryDB-pre">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
|
|
Loading…
Reference in New Issue