merging with branch beta

This commit is contained in:
Miriam Baglioni 2022-05-24 18:37:32 +02:00
commit 108e17644e
12 changed files with 143 additions and 78 deletions

View File

@ -15,7 +15,7 @@
"official_name": "Aperta TÜBİTAK Open Archive" "official_name": "Aperta TÜBİTAK Open Archive"
}, },
"BL.CAM": { "BL.CAM": {
"openaire_id": "re3data_____::r3d100010620", "openaire_id": "opendoar____::109",
"datacite_name": "Apollo", "datacite_name": "Apollo",
"official_name": "Apollo" "official_name": "Apollo"
}, },
@ -196,7 +196,7 @@
}, },
"CSIC.DIGITAL": { "CSIC.DIGITAL": {
"openaire_id": "re3data_____::r3d100011076", "openaire_id": "re3data_____::r3d100011076",
"datacite_name": "DIGITAL.CSIC", "datacite_name": "Digital CSIC",
"official_name": "DIGITAL.CSIC" "official_name": "DIGITAL.CSIC"
}, },
"BL.DRI": { "BL.DRI": {
@ -644,6 +644,11 @@
"datacite_name": "PANGAEA", "datacite_name": "PANGAEA",
"official_name": "PANGAEA" "official_name": "PANGAEA"
}, },
"TIB.PANGAEA": {
"openaire_id": "re3data_____::r3d100010134",
"datacite_name": "PANGAEA",
"official_name": "PANGAEA"
},
"NASAPDS.NASAPDS": { "NASAPDS.NASAPDS": {
"openaire_id": "re3data_____::r3d100010121", "openaire_id": "re3data_____::r3d100010121",
"datacite_name": "PDS", "datacite_name": "PDS",
@ -896,7 +901,7 @@
}, },
"FIGSHARE.UCT": { "FIGSHARE.UCT": {
"openaire_id": "re3data_____::r3d100012633", "openaire_id": "re3data_____::r3d100012633",
"datacite_name": "ZivaHub", "datacite_name": "University of Cape Town (UCT)",
"official_name": "ZivaHub" "official_name": "ZivaHub"
}, },
"BL.UCLAN": { "BL.UCLAN": {
@ -1030,9 +1035,9 @@
"official_name": "ZBW Journal Data Archive" "official_name": "ZBW Journal Data Archive"
}, },
"CERN.ZENODO": { "CERN.ZENODO": {
"openaire_id": "re3data_____::r3d100010468", "openaire_id": "opendoar____::2659",
"datacite_name": "Zenodo", "datacite_name": "Zenodo",
"official_name": "Zenodo" "official_name": "ZENODO"
}, },
"ZBW.ZEW": { "ZBW.ZEW": {
"openaire_id": "re3data_____::r3d100010399", "openaire_id": "re3data_____::r3d100010399",

View File

@ -30,7 +30,7 @@ public class SparkEoscTag {
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static final Qualifier EOSC_QUALIFIER = OafMapperUtils public static final Qualifier EOSC_QUALIFIER = OafMapperUtils
.qualifier( .qualifier(
"eosc", "EOSC",
"European Open Science Cloud", "European Open Science Cloud",
ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES); ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES);
public static final DataInfo EOSC_DATAINFO = OafMapperUtils public static final DataInfo EOSC_DATAINFO = OafMapperUtils
@ -95,7 +95,15 @@ public class SparkEoscTag {
if (containsCriteriaNotebook(s)) { if (containsCriteriaNotebook(s)) {
sbject.add(EOSC_NOTEBOOK); sbject.add(EOSC_NOTEBOOK);
if (sbject.stream().anyMatch(sb -> sb.getValue().equals("EOSC Jupyter Notebook"))) {
sbject = sbject.stream().map(sb -> {
if (sb.getValue().equals("EOSC Jupyter Notebook")) {
return null;
}
return sb;
}).filter(Objects::nonNull).collect(Collectors.toList());
s.setSubject(sbject);
}
} }
if (containsCriteriaGalaxy(s)) { if (containsCriteriaGalaxy(s)) {
sbject.add(EOSC_GALAXY); sbject.add(EOSC_GALAXY);

View File

@ -102,21 +102,28 @@ public class SparkCountryPropagationJob {
private static <R extends Result> MapFunction<Tuple2<R, ResultCountrySet>, R> getCountryMergeFn() { private static <R extends Result> MapFunction<Tuple2<R, ResultCountrySet>, R> getCountryMergeFn() {
return t -> { return t -> {
Optional.ofNullable(t._2()).ifPresent(r -> { Optional.ofNullable(t._2()).ifPresent(r -> {
t._1().getCountry().addAll(merge(t._1().getCountry(), r.getCountrySet())); if (Optional.ofNullable(t._1().getCountry()).isPresent())
t._1().getCountry().addAll(merge(t._1().getCountry(), r.getCountrySet()));
else
t._1().setCountry(merge(null, t._2().getCountrySet()));
}); });
return t._1(); return t._1();
}; };
} }
private static List<Country> merge(List<Country> c1, List<CountrySbs> c2) { private static List<Country> merge(List<Country> c1, List<CountrySbs> c2) {
HashSet<String> countries = c1 HashSet<String> countries = new HashSet<>();
.stream() if (Optional.ofNullable(c1).isPresent()) {
.map(Qualifier::getClassid) countries = c1
.collect(Collectors.toCollection(HashSet::new)); .stream()
.map(Qualifier::getClassid)
.collect(Collectors.toCollection(HashSet::new));
}
HashSet<String> finalCountries = countries;
return c2 return c2
.stream() .stream()
.filter(c -> !countries.contains(c.getClassid())) .filter(c -> !finalCountries.contains(c.getClassid()))
.map(c -> getCountry(c.getClassid(), c.getClassname())) .map(c -> getCountry(c.getClassid(), c.getClassname()))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }

View File

@ -132,7 +132,7 @@ public class EOSCTagJobTest {
Assertions Assertions
.assertEquals( .assertEquals(
2, tmp 1, tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect() .collect()
.get(0) .get(0)
@ -326,7 +326,7 @@ public class EOSCTagJobTest {
Assertions Assertions
.assertEquals( .assertEquals(
2, 1,
tmp tmp
.filter( .filter(
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))) s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
@ -352,21 +352,12 @@ public class EOSCTagJobTest {
Assertions Assertions
.assertEquals( .assertEquals(
6, tmp 5, tmp
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")) .filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
.collect() .collect()
.get(0) .get(0)
.getSubject() .getSubject()
.size()); .size());
Assertions
.assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
Assertions Assertions
.assertEquals( .assertEquals(
@ -394,7 +385,7 @@ public class EOSCTagJobTest {
Assertions Assertions
.assertEquals( .assertEquals(
2, 1,
orp orp
.filter( .filter(
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))) s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
@ -438,14 +429,14 @@ public class EOSCTagJobTest {
Assertions Assertions
.assertEquals( .assertEquals(
3, orp 2, orp
.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72")) .filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72"))
.collect() .collect()
.get(0) .get(0)
.getSubject() .getSubject()
.size()); .size());
Assertions Assertions
.assertTrue( .assertFalse(
orp orp
.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72")) .filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72"))
.collect() .collect()

View File

@ -283,7 +283,15 @@
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<join name="wait_dispatch" to="copy_relation"/> <join name="wait_dispatch" to="delete_target_relation"/>
<action name="delete_target_relation">
<fs>
<delete path="${nameNode}/${graphOutputPath}/relation"/>
</fs>
<ok to="copy_relation"/>
<error to="Kill"/>
</action>
<action name="copy_relation"> <action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">

View File

@ -10,4 +10,4 @@ SELECT
'OpenOrgs Database' AS collectedfromname, 'OpenOrgs Database' AS collectedfromname,
'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction 'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction
FROM relationships FROM relationships
WHERE reltype = 'Child' OR reltype = 'Parent' WHERE reltype = 'IsChildOf' OR reltype = 'IsParentOf'

View File

@ -103,21 +103,19 @@ object SparkConvertRDDtoDataset {
"IsAmongTopNSimilarDocuments" "IsAmongTopNSimilarDocuments"
) )
val rddRelation = spark.sparkContext val rddRelation = spark.sparkContext
.textFile(s"$sourcePath/relation") .textFile(s"$sourcePath/relation")
.map(s => mapper.readValue(s, classOf[Relation])) .map(s => mapper.readValue(s, classOf[Relation]))
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false) .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
.filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50")) .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
//filter OpenCitations relations //filter OpenCitations relations
.filter(r => r.getCollectedfrom!= null && r.getCollectedfrom.size()>0 && !r.getCollectedfrom.asScala.exists(k => "opencitations".equalsIgnoreCase(k.getValue))) .filter(r =>
r.getCollectedfrom != null && r.getCollectedfrom.size() > 0 && !r.getCollectedfrom.asScala.exists(k =>
"opencitations".equalsIgnoreCase(k.getValue)
)
)
.filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass))) .filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath") spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
} }
} }

View File

@ -41,7 +41,8 @@
} }
}, },
"publicationDate": { "publicationDate": {
"type": "keyword" "type": "date",
"format": "yyyy-MM-dd"
}, },
"relationship": { "relationship": {
"properties": { "properties": {

View File

@ -81,3 +81,33 @@ where reltype='resultResult'
and r1.resulttype.classname != 'other' and r1.resulttype.classname != 'other'
and r2.resulttype.classname != 'other' and r2.resulttype.classname != 'other'
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE;
create table ${stats_db_name}.result_citations_oc stored as parquet as
select substr(target, 4) as id, count(distinct substr(source, 4)) as citations
from ${openaire_db_name}.relation rel
join ${openaire_db_name}.result r1 on rel.source=r1.id
join ${openaire_db_name}.result r2 on r2.id=rel.target
where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:crosswalk:opencitations'
and reltype='resultResult'
and r1.resulttype.classname!=r2.resulttype.classname
and r1.datainfo.deletedbyinference=false and r1.datainfo.invisible = FALSE
and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE
and r1.resulttype.classname != 'other'
and r2.resulttype.classname != 'other'
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
group by substr(target, 4);
create table ${stats_db_name}.result_references_oc stored as parquet as
select substr(source, 4) as id, count(distinct substr(target, 4)) as references
from ${openaire_db_name}.relation rel
join ${openaire_db_name}.result r1 on rel.source=r1.id
join ${openaire_db_name}.result r2 on r2.id=rel.target
where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:crosswalk:opencitations'
and reltype='resultResult'
and r1.resulttype.classname!=r2.resulttype.classname
and r1.datainfo.deletedbyinference=false and r1.datainfo.invisible = FALSE
and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE
and r1.resulttype.classname != 'other'
and r2.resulttype.classname != 'other'
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
group by substr(source, 4);

View File

@ -82,31 +82,31 @@ on r.id= tmp.id;
compute stats indi_funded_result_with_fundref; compute stats indi_funded_result_with_fundref;
create table indi_result_org_country_collab stored as parquet as -- create table indi_result_org_country_collab stored as parquet as
with tmp as -- with tmp as
(select o.id as id, o.country , ro.id as result,r.type from organization o -- (select o.id as id, o.country , ro.id as result,r.type from organization o
join result_organization ro on o.id=ro.organization -- join result_organization ro on o.id=ro.organization
join result r on r.id=ro.id where o.country <> 'UNKNOWN') -- join result r on r.id=ro.id where o.country <> 'UNKNOWN')
select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations -- select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations
from tmp as o1 -- from tmp as o1
join tmp as o2 on o1.result=o2.result -- join tmp as o2 on o1.result=o2.result
where o1.id<>o2.id and o1.country<>o2.country -- where o1.id<>o2.id and o1.country<>o2.country
group by o1.id, o1.type,o2.country; -- group by o1.id, o1.type,o2.country;
--
-- compute stats indi_result_org_country_collab;
compute stats indi_result_org_country_collab; -- create table indi_result_org_collab stored as parquet as
-- with tmp as
create table indi_result_org_collab stored as parquet as -- (select o.id, ro.id as result,r.type from organization o
with tmp as -- join result_organization ro on o.id=ro.organization
(select o.id, ro.id as result,r.type from organization o -- join result r on r.id=ro.id)
join result_organization ro on o.id=ro.organization -- select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations
join result r on r.id=ro.id) -- from tmp as o1
select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations -- join tmp as o2 on o1.result=o2.result
from tmp as o1 -- where o1.id<>o2.id
join tmp as o2 on o1.result=o2.result -- group by o1.id, o2.id, o1.type;
where o1.id<>o2.id --
group by o1.id, o2.id, o1.type; -- compute stats indi_result_org_collab;
compute stats indi_result_org_collab;
create table indi_funder_country_collab stored as parquet as create table indi_funder_country_collab stored as parquet as
with tmp as (select funder, project, country from organization_projects op with tmp as (select funder, project, country from organization_projects op

View File

@ -18,28 +18,45 @@ create table TARGET.result stored as parquet as
select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
union all union all
select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
'openorgs____::759d59f05d77188faee99b7493b46805', 'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC"
'openorgs____::b84450f9864182c67b8611b5593f4250', 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ??
'openorgs____::eadc8da90a546e98c03f896661a2e4d4', 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
'openorgs____::d169c7407dd417152596908d48c11460', 'openorgs____::2fb1e47b4612688d9de9169d579939a7', --University of Helsinki
'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
'openorgs____::2fb1e47b4612688d9de9169d579939a7', 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
'openorgs____::759d59f05d77188faee99b7493b46805', 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
'openorgs____::cad284878801b9465fa51a95b1d779db', 'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
'openorgs____::eadc8da90a546e98c03f896661a2e4d4', -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
'openorgs____::c0286313e36479eff8676dba9b724b40' 'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
-- ,'openorgs____::c80a8243a5e5c620d7931c88d93bf17a' -- Paris Diderot 'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
) )) foo; 'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
'openorgs____::6445d7758d3a40c4d997953b6632a368' --National Institute of Informatics (NII)
) )) foo;
compute stats TARGET.result; compute stats TARGET.result;
create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_citations; compute stats TARGET.result_citations;
create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_references_oc;
create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_citations_oc;
create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_classifications; compute stats TARGET.result_classifications;
create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_apc;
create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_concepts; compute stats TARGET.result_concepts;