forked from D-Net/dnet-hadoop
Merge branch 'beta' of https://code-repo.d4science.org/D-Net/dnet-hadoop into beta
This commit is contained in:
commit
922c6d66ef
|
@ -95,13 +95,14 @@ public class SparkEoscTag {
|
||||||
|
|
||||||
if (containsCriteriaNotebook(s)) {
|
if (containsCriteriaNotebook(s)) {
|
||||||
sbject.add(EOSC_NOTEBOOK);
|
sbject.add(EOSC_NOTEBOOK);
|
||||||
if (sbject.stream().anyMatch(sb -> sb.getValue().equals("EOSC Jupyter Notebook"))){
|
if (sbject.stream().anyMatch(sb -> sb.getValue().equals("EOSC Jupyter Notebook"))) {
|
||||||
sbject = sbject.stream().map(sb -> {
|
sbject = sbject.stream().map(sb -> {
|
||||||
if (sb.getValue().equals("EOSC Jupyter Notebook")){
|
if (sb.getValue().equals("EOSC Jupyter Notebook")) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
return sb;
|
return sb;
|
||||||
}).filter(Objects::nonNull).collect(Collectors.toList());
|
}).filter(Objects::nonNull).collect(Collectors.toList());
|
||||||
|
s.setSubject(sbject);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (containsCriteriaGalaxy(s)) {
|
if (containsCriteriaGalaxy(s)) {
|
||||||
|
|
|
@ -102,21 +102,28 @@ public class SparkCountryPropagationJob {
|
||||||
private static <R extends Result> MapFunction<Tuple2<R, ResultCountrySet>, R> getCountryMergeFn() {
|
private static <R extends Result> MapFunction<Tuple2<R, ResultCountrySet>, R> getCountryMergeFn() {
|
||||||
return t -> {
|
return t -> {
|
||||||
Optional.ofNullable(t._2()).ifPresent(r -> {
|
Optional.ofNullable(t._2()).ifPresent(r -> {
|
||||||
t._1().getCountry().addAll(merge(t._1().getCountry(), r.getCountrySet()));
|
if (Optional.ofNullable(t._1().getCountry()).isPresent())
|
||||||
|
t._1().getCountry().addAll(merge(t._1().getCountry(), r.getCountrySet()));
|
||||||
|
else
|
||||||
|
t._1().setCountry(merge(null, t._2().getCountrySet()));
|
||||||
});
|
});
|
||||||
return t._1();
|
return t._1();
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
private static List<Country> merge(List<Country> c1, List<CountrySbs> c2) {
|
private static List<Country> merge(List<Country> c1, List<CountrySbs> c2) {
|
||||||
HashSet<String> countries = c1
|
HashSet<String> countries = new HashSet<>();
|
||||||
.stream()
|
if (Optional.ofNullable(c1).isPresent()) {
|
||||||
.map(Qualifier::getClassid)
|
countries = c1
|
||||||
.collect(Collectors.toCollection(HashSet::new));
|
.stream()
|
||||||
|
.map(Qualifier::getClassid)
|
||||||
|
.collect(Collectors.toCollection(HashSet::new));
|
||||||
|
}
|
||||||
|
|
||||||
|
HashSet<String> finalCountries = countries;
|
||||||
return c2
|
return c2
|
||||||
.stream()
|
.stream()
|
||||||
.filter(c -> !countries.contains(c.getClassid()))
|
.filter(c -> !finalCountries.contains(c.getClassid()))
|
||||||
.map(c -> getCountry(c.getClassid(), c.getClassname()))
|
.map(c -> getCountry(c.getClassid(), c.getClassname()))
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
|
@ -132,7 +132,7 @@ public class EOSCTagJobTest {
|
||||||
|
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
2, tmp
|
1, tmp
|
||||||
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
|
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
|
||||||
.collect()
|
.collect()
|
||||||
.get(0)
|
.get(0)
|
||||||
|
@ -326,7 +326,7 @@ public class EOSCTagJobTest {
|
||||||
|
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
2,
|
1,
|
||||||
tmp
|
tmp
|
||||||
.filter(
|
.filter(
|
||||||
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
|
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
|
||||||
|
@ -352,21 +352,12 @@ public class EOSCTagJobTest {
|
||||||
|
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
6, tmp
|
5, tmp
|
||||||
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
|
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
|
||||||
.collect()
|
.collect()
|
||||||
.get(0)
|
.get(0)
|
||||||
.getSubject()
|
.getSubject()
|
||||||
.size());
|
.size());
|
||||||
Assertions
|
|
||||||
.assertTrue(
|
|
||||||
tmp
|
|
||||||
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
|
|
||||||
.collect()
|
|
||||||
.get(0)
|
|
||||||
.getSubject()
|
|
||||||
.stream()
|
|
||||||
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
|
|
||||||
|
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
|
@ -394,7 +385,7 @@ public class EOSCTagJobTest {
|
||||||
|
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
2,
|
1,
|
||||||
orp
|
orp
|
||||||
.filter(
|
.filter(
|
||||||
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
|
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
|
||||||
|
@ -438,14 +429,14 @@ public class EOSCTagJobTest {
|
||||||
|
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
3, orp
|
2, orp
|
||||||
.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72"))
|
.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72"))
|
||||||
.collect()
|
.collect()
|
||||||
.get(0)
|
.get(0)
|
||||||
.getSubject()
|
.getSubject()
|
||||||
.size());
|
.size());
|
||||||
Assertions
|
Assertions
|
||||||
.assertTrue(
|
.assertFalse(
|
||||||
orp
|
orp
|
||||||
.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72"))
|
.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72"))
|
||||||
.collect()
|
.collect()
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -81,3 +81,33 @@ where reltype='resultResult'
|
||||||
and r1.resulttype.classname != 'other'
|
and r1.resulttype.classname != 'other'
|
||||||
and r2.resulttype.classname != 'other'
|
and r2.resulttype.classname != 'other'
|
||||||
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE;
|
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE;
|
||||||
|
|
||||||
|
create table ${stats_db_name}.result_citations_oc stored as parquet as
|
||||||
|
select substr(target, 4) as id, count(distinct substr(source, 4)) as citations
|
||||||
|
from ${openaire_db_name}.relation rel
|
||||||
|
join ${openaire_db_name}.result r1 on rel.source=r1.id
|
||||||
|
join ${openaire_db_name}.result r2 on r2.id=rel.target
|
||||||
|
where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:crosswalk:opencitations'
|
||||||
|
and reltype='resultResult'
|
||||||
|
and r1.resulttype.classname!=r2.resulttype.classname
|
||||||
|
and r1.datainfo.deletedbyinference=false and r1.datainfo.invisible = FALSE
|
||||||
|
and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE
|
||||||
|
and r1.resulttype.classname != 'other'
|
||||||
|
and r2.resulttype.classname != 'other'
|
||||||
|
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
|
||||||
|
group by substr(target, 4);
|
||||||
|
|
||||||
|
create table ${stats_db_name}.result_references_oc stored as parquet as
|
||||||
|
select substr(source, 4) as id, count(distinct substr(target, 4)) as references
|
||||||
|
from ${openaire_db_name}.relation rel
|
||||||
|
join ${openaire_db_name}.result r1 on rel.source=r1.id
|
||||||
|
join ${openaire_db_name}.result r2 on r2.id=rel.target
|
||||||
|
where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:crosswalk:opencitations'
|
||||||
|
and reltype='resultResult'
|
||||||
|
and r1.resulttype.classname!=r2.resulttype.classname
|
||||||
|
and r1.datainfo.deletedbyinference=false and r1.datainfo.invisible = FALSE
|
||||||
|
and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE
|
||||||
|
and r1.resulttype.classname != 'other'
|
||||||
|
and r2.resulttype.classname != 'other'
|
||||||
|
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
|
||||||
|
group by substr(source, 4);
|
|
@ -82,31 +82,31 @@ on r.id= tmp.id;
|
||||||
|
|
||||||
compute stats indi_funded_result_with_fundref;
|
compute stats indi_funded_result_with_fundref;
|
||||||
|
|
||||||
create table indi_result_org_country_collab stored as parquet as
|
-- create table indi_result_org_country_collab stored as parquet as
|
||||||
with tmp as
|
-- with tmp as
|
||||||
(select o.id as id, o.country , ro.id as result,r.type from organization o
|
-- (select o.id as id, o.country , ro.id as result,r.type from organization o
|
||||||
join result_organization ro on o.id=ro.organization
|
-- join result_organization ro on o.id=ro.organization
|
||||||
join result r on r.id=ro.id where o.country <> 'UNKNOWN')
|
-- join result r on r.id=ro.id where o.country <> 'UNKNOWN')
|
||||||
select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations
|
-- select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations
|
||||||
from tmp as o1
|
-- from tmp as o1
|
||||||
join tmp as o2 on o1.result=o2.result
|
-- join tmp as o2 on o1.result=o2.result
|
||||||
where o1.id<>o2.id and o1.country<>o2.country
|
-- where o1.id<>o2.id and o1.country<>o2.country
|
||||||
group by o1.id, o1.type,o2.country;
|
-- group by o1.id, o1.type,o2.country;
|
||||||
|
--
|
||||||
|
-- compute stats indi_result_org_country_collab;
|
||||||
|
|
||||||
compute stats indi_result_org_country_collab;
|
-- create table indi_result_org_collab stored as parquet as
|
||||||
|
-- with tmp as
|
||||||
create table indi_result_org_collab stored as parquet as
|
-- (select o.id, ro.id as result,r.type from organization o
|
||||||
with tmp as
|
-- join result_organization ro on o.id=ro.organization
|
||||||
(select o.id, ro.id as result,r.type from organization o
|
-- join result r on r.id=ro.id)
|
||||||
join result_organization ro on o.id=ro.organization
|
-- select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations
|
||||||
join result r on r.id=ro.id)
|
-- from tmp as o1
|
||||||
select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations
|
-- join tmp as o2 on o1.result=o2.result
|
||||||
from tmp as o1
|
-- where o1.id<>o2.id
|
||||||
join tmp as o2 on o1.result=o2.result
|
-- group by o1.id, o2.id, o1.type;
|
||||||
where o1.id<>o2.id
|
--
|
||||||
group by o1.id, o2.id, o1.type;
|
-- compute stats indi_result_org_collab;
|
||||||
|
|
||||||
compute stats indi_result_org_collab;
|
|
||||||
|
|
||||||
create table indi_funder_country_collab stored as parquet as
|
create table indi_funder_country_collab stored as parquet as
|
||||||
with tmp as (select funder, project, country from organization_projects op
|
with tmp as (select funder, project, country from organization_projects op
|
||||||
|
|
|
@ -18,28 +18,45 @@ create table TARGET.result stored as parquet as
|
||||||
select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
|
select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
|
||||||
union all
|
union all
|
||||||
select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
|
select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
|
||||||
'openorgs____::759d59f05d77188faee99b7493b46805',
|
'openorgs____::b84450f9864182c67b8611b5593f4250', --Athena Research and Innovation Center In Information Communication & Knowledge Technologies (ARC)
|
||||||
'openorgs____::b84450f9864182c67b8611b5593f4250',
|
'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
|
||||||
'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975',
|
'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Does not exist? (verify this ID)
|
||||||
'openorgs____::eadc8da90a546e98c03f896661a2e4d4',
|
'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
|
||||||
'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2',
|
'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
|
||||||
'openorgs____::d169c7407dd417152596908d48c11460',
|
'openorgs____::2fb1e47b4612688d9de9169d579939a7', --University of Helsinki
|
||||||
'openorgs____::1ec924b1759bb16d0a02f2dad8689b21',
|
'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
|
||||||
'openorgs____::2fb1e47b4612688d9de9169d579939a7',
|
'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
|
||||||
'openorgs____::759d59f05d77188faee99b7493b46805',
|
'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
|
||||||
'openorgs____::cad284878801b9465fa51a95b1d779db',
|
'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
|
||||||
'openorgs____::eadc8da90a546e98c03f896661a2e4d4',
|
-- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
|
||||||
'openorgs____::c0286313e36479eff8676dba9b724b40'
|
'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
|
||||||
-- ,'openorgs____::c80a8243a5e5c620d7931c88d93bf17a' -- Paris Diderot
|
'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
|
||||||
) )) foo;
|
'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
|
||||||
|
'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
|
||||||
|
'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
|
||||||
|
'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
|
||||||
|
'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
|
||||||
|
'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
|
||||||
|
'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
|
||||||
|
'openorgs____::6445d7758d3a40c4d997953b6632a368' --National Institute of Informatics (NII)
|
||||||
|
) )) foo;
|
||||||
compute stats TARGET.result;
|
compute stats TARGET.result;
|
||||||
|
|
||||||
create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_citations;
|
compute stats TARGET.result_citations;
|
||||||
|
|
||||||
|
create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
|
compute stats TARGET.result_references_oc;
|
||||||
|
|
||||||
|
create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
|
compute stats TARGET.result_citations_oc;
|
||||||
|
|
||||||
create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_classifications;
|
compute stats TARGET.result_classifications;
|
||||||
|
|
||||||
|
create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
|
compute stats TARGET.result_apc;
|
||||||
|
|
||||||
create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_concepts;
|
compute stats TARGET.result_concepts;
|
||||||
|
|
||||||
|
@ -90,11 +107,6 @@ compute stats TARGET.result_sources;
|
||||||
create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_topics;
|
compute stats TARGET.result_topics;
|
||||||
|
|
||||||
create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
|
||||||
compute stats TARGET.result_apc;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result);
|
create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result);
|
||||||
create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result);
|
create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result);
|
||||||
create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou;
|
create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou;
|
||||||
|
|
|
@ -127,6 +127,7 @@ CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS
|
||||||
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
|
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
|
||||||
FROM ${openaire_db_name}.relation r
|
FROM ${openaire_db_name}.relation r
|
||||||
WHERE r.reltype = 'resultOrganization'
|
WHERE r.reltype = 'resultOrganization'
|
||||||
|
and r.target like '50|%'
|
||||||
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
|
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
|
||||||
|
|
||||||
CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS
|
CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS
|
||||||
|
|
|
@ -93,7 +93,7 @@ where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false;
|
||||||
CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS
|
CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS
|
||||||
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
|
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
|
||||||
FROM ${openaire_db_name}.relation r
|
FROM ${openaire_db_name}.relation r
|
||||||
WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
|
WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.target like '20|%' and r.datainfo.invisible=false;
|
||||||
|
|
||||||
-- datasource sources:
|
-- datasource sources:
|
||||||
-- where the datasource info have been collected from.
|
-- where the datasource info have been collected from.
|
||||||
|
|
Loading…
Reference in New Issue