From b7cd2c6ca1a377097775d9dcc7ed6cfebdbd9728 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Wed, 20 Apr 2022 14:46:55 +0300 Subject: [PATCH 01/14] added open citations --- .../graph/stats/oozie_app/scripts/step13.sql | 32 ++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql index a5839da11..aee66fd5e 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql @@ -80,4 +80,34 @@ where reltype='resultResult' and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE and r1.resulttype.classname != 'other' and r2.resulttype.classname != 'other' - and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; \ No newline at end of file + and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; + +create table ${stats_db_name}.result_citations_oc stored as parquet as +select substr(target, 4) as id, count(distinct substr(source, 4)) as citations +from ${openaire_db_name}.relation rel +join ${openaire_db_name}.result r1 on rel.source=r1.id +join ${openaire_db_name}.result r2 on r2.id=rel.target +where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:crosswalk:opencitations' + and reltype='resultResult' + and r1.resulttype.classname!=r2.resulttype.classname + and r1.datainfo.deletedbyinference=false and r1.datainfo.invisible = FALSE + and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE + and r1.resulttype.classname != 'other' + and r2.resulttype.classname != 'other' + and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE +group by substr(target, 4); + +create table ${stats_db_name}.result_references_oc stored as parquet as +select substr(source, 4) as id, count(distinct substr(target, 4)) as references +from ${openaire_db_name}.relation rel + join ${openaire_db_name}.result r1 on rel.source=r1.id + join ${openaire_db_name}.result r2 on r2.id=rel.target +where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:crosswalk:opencitations' + and reltype='resultResult' + and r1.resulttype.classname!=r2.resulttype.classname + and r1.datainfo.deletedbyinference=false and r1.datainfo.invisible = FALSE + and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE + and r1.resulttype.classname != 'other' + and r2.resulttype.classname != 'other' + and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE +group by substr(source, 4); \ No newline at end of file From cfbbcaf7c42c85f8bd9095ecd97c5adadf049bdf Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Fri, 6 May 2022 12:49:36 +0300 Subject: [PATCH 02/14] commented out indi_result_org_country_collab --- .../scripts/step16-createIndicatorsTables.sql | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index 24e6bff7e..09b24f741 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -82,18 +82,18 @@ on r.id= tmp.id; compute stats indi_funded_result_with_fundref; -create table indi_result_org_country_collab stored as parquet as -with tmp as -(select o.id as id, o.country , ro.id as result,r.type from organization o -join result_organization ro on o.id=ro.organization -join result r on r.id=ro.id where o.country <> 'UNKNOWN') -select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations -from tmp as o1 -join tmp as o2 on o1.result=o2.result -where o1.id<>o2.id and o1.country<>o2.country -group by o1.id, o1.type,o2.country; - -compute stats indi_result_org_country_collab; +-- create table indi_result_org_country_collab stored as parquet as +-- with tmp as +-- (select o.id as id, o.country , ro.id as result,r.type from organization o +-- join result_organization ro on o.id=ro.organization +-- join result r on r.id=ro.id where o.country <> 'UNKNOWN') +-- select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations +-- from tmp as o1 +-- join tmp as o2 on o1.result=o2.result +-- where o1.id<>o2.id and o1.country<>o2.country +-- group by o1.id, o1.type,o2.country; +-- +-- compute stats indi_result_org_country_collab; create table indi_result_org_collab stored as parquet as with tmp as From 61b4c19e6554b7b9ed53d1d1966240ce956c1211 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Fri, 6 May 2022 12:52:10 +0300 Subject: [PATCH 03/14] restored indi_result_org_country_collab, removed indi_result_org_collab --- .../scripts/step16-createIndicatorsTables.sql | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index 09b24f741..c40618510 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -82,31 +82,31 @@ on r.id= tmp.id; compute stats indi_funded_result_with_fundref; --- create table indi_result_org_country_collab stored as parquet as --- with tmp as --- (select o.id as id, o.country , ro.id as result,r.type from organization o --- join result_organization ro on o.id=ro.organization --- join result r on r.id=ro.id where o.country <> 'UNKNOWN') --- select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations --- from tmp as o1 --- join tmp as o2 on o1.result=o2.result --- where o1.id<>o2.id and o1.country<>o2.country --- group by o1.id, o1.type,o2.country; --- --- compute stats indi_result_org_country_collab; - -create table indi_result_org_collab stored as parquet as +create table indi_result_org_country_collab stored as parquet as with tmp as -(select o.id, ro.id as result,r.type from organization o +(select o.id as id, o.country , ro.id as result,r.type from organization o join result_organization ro on o.id=ro.organization -join result r on r.id=ro.id) -select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations +join result r on r.id=ro.id where o.country <> 'UNKNOWN') +select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations from tmp as o1 join tmp as o2 on o1.result=o2.result -where o1.id<>o2.id -group by o1.id, o2.id, o1.type; +where o1.id<>o2.id and o1.country<>o2.country +group by o1.id, o1.type,o2.country; -compute stats indi_result_org_collab; +compute stats indi_result_org_country_collab; + +-- create table indi_result_org_collab stored as parquet as +-- with tmp as +-- (select o.id, ro.id as result,r.type from organization o +-- join result_organization ro on o.id=ro.organization +-- join result r on r.id=ro.id) +-- select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations +-- from tmp as o1 +-- join tmp as o2 on o1.result=o2.result +-- where o1.id<>o2.id +-- group by o1.id, o2.id, o1.type; +-- +-- compute stats indi_result_org_collab; create table indi_funder_country_collab stored as parquet as with tmp as (select funder, project, country from organization_projects op From 23334479bb7e5219ae14541e41abd7ee3903a3e2 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Wed, 11 May 2022 13:05:52 +0300 Subject: [PATCH 04/14] removed yet another collab, added more orgs in monitor --- .../scripts/step16-createIndicatorsTables.sql | 24 ++++++------- .../scripts/step20-createMonitorDB.sql | 35 +++++++++++-------- 2 files changed, 33 insertions(+), 26 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index c40618510..db40cf973 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -82,18 +82,18 @@ on r.id= tmp.id; compute stats indi_funded_result_with_fundref; -create table indi_result_org_country_collab stored as parquet as -with tmp as -(select o.id as id, o.country , ro.id as result,r.type from organization o -join result_organization ro on o.id=ro.organization -join result r on r.id=ro.id where o.country <> 'UNKNOWN') -select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations -from tmp as o1 -join tmp as o2 on o1.result=o2.result -where o1.id<>o2.id and o1.country<>o2.country -group by o1.id, o1.type,o2.country; - -compute stats indi_result_org_country_collab; +-- create table indi_result_org_country_collab stored as parquet as +-- with tmp as +-- (select o.id as id, o.country , ro.id as result,r.type from organization o +-- join result_organization ro on o.id=ro.organization +-- join result r on r.id=ro.id where o.country <> 'UNKNOWN') +-- select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations +-- from tmp as o1 +-- join tmp as o2 on o1.result=o2.result +-- where o1.id<>o2.id and o1.country<>o2.country +-- group by o1.id, o1.type,o2.country; +-- +-- compute stats indi_result_org_country_collab; -- create table indi_result_org_collab stored as parquet as -- with tmp as diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index bcc9f0b5d..4dd434101 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -18,20 +18,27 @@ create table TARGET.result stored as parquet as select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) union all select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( - 'openorgs____::759d59f05d77188faee99b7493b46805', - 'openorgs____::b84450f9864182c67b8611b5593f4250', - 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', - 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', - 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', - 'openorgs____::d169c7407dd417152596908d48c11460', - 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', - 'openorgs____::2fb1e47b4612688d9de9169d579939a7', - 'openorgs____::759d59f05d77188faee99b7493b46805', - 'openorgs____::cad284878801b9465fa51a95b1d779db', - 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', - 'openorgs____::c0286313e36479eff8676dba9b724b40' - -- ,'openorgs____::c80a8243a5e5c620d7931c88d93bf17a' -- Paris Diderot - ) )) foo; + 'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC" + 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council + 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ?? + 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University + 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade + 'openorgs____::2fb1e47b4612688d9de9169d579939a7', --University of Helsinki + 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho + 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid + 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen + 'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens + -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot + 'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University + 'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark + 'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin + 'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt + 'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven + 'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape + 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute + 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University + 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg + 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII) ) )) foo; compute stats TARGET.result; create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); From c25134f28d8586591284a1740f8bd89b7cc0b0d5 Mon Sep 17 00:00:00 2001 From: antleb Date: Thu, 12 May 2022 14:55:47 +0300 Subject: [PATCH 05/14] fixed typo --- .../graph/stats/oozie_app/scripts/step20-createMonitorDB.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 4dd434101..2dde7171f 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -38,7 +38,8 @@ create table TARGET.result stored as parquet as 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg - 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII) ) )) foo; + 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII) + ) )) foo; compute stats TARGET.result; create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); From e4eac1d20bd8981939fe4efbbd94c26d680d6999 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 13 May 2022 11:01:33 +0200 Subject: [PATCH 06/14] [EOSC TAG] added code to remove EOSC Jupyter Notebook from subjects and put EOSC as classid in the qualifier --- .../java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java | 11 +++++++++-- .../dhp/sx/graph/SparkConvertRDDtoDataset.scala | 12 +++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java index 561e1d57e..e8c79e11d 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java @@ -30,7 +30,7 @@ public class SparkEoscTag { public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static final Qualifier EOSC_QUALIFIER = OafMapperUtils .qualifier( - "eosc", + "EOSC", "European Open Science Cloud", ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES); public static final DataInfo EOSC_DATAINFO = OafMapperUtils @@ -95,7 +95,14 @@ public class SparkEoscTag { if (containsCriteriaNotebook(s)) { sbject.add(EOSC_NOTEBOOK); - + if (sbject.stream().anyMatch(sb -> sb.getValue().equals("EOSC Jupyter Notebook"))){ + sbject = sbject.stream().map(sb -> { + if (sb.getValue().equals("EOSC Jupyter Notebook")){ + return null; + } + return sb; + }).filter(Objects::nonNull).collect(Collectors.toList()); + } } if (containsCriteriaGalaxy(s)) { sbject.add(EOSC_GALAXY); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala index 7c3a212ac..bd970a5cf 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala @@ -103,21 +103,19 @@ object SparkConvertRDDtoDataset { "IsAmongTopNSimilarDocuments" ) - val rddRelation = spark.sparkContext .textFile(s"$sourcePath/relation") .map(s => mapper.readValue(s, classOf[Relation])) .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false) .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50")) //filter OpenCitations relations - .filter(r => r.getCollectedfrom!= null && r.getCollectedfrom.size()>0 && !r.getCollectedfrom.asScala.exists(k => "opencitations".equalsIgnoreCase(k.getValue))) + .filter(r => + r.getCollectedfrom != null && r.getCollectedfrom.size() > 0 && !r.getCollectedfrom.asScala.exists(k => + "opencitations".equalsIgnoreCase(k.getValue) + ) + ) .filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass))) spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath") - - - - - } } From 3fc9efeab6559edc2fd0ad839473a6bbc03c89f5 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Fri, 13 May 2022 14:28:13 +0300 Subject: [PATCH 07/14] fixed typo, addded open citations and apcs in monitor --- .../oozie_app/scripts/step20-createMonitorDB.sql | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 4dd434101..3cf155869 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -38,15 +38,25 @@ create table TARGET.result stored as parquet as 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg - 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII) ) )) foo; + 'openorgs____::6445d7758d3a40c4d997953b6632a368' --National Institute of Informatics (NII) + ) )) foo; compute stats TARGET.result; create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.result_citations; +create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_references_oc; + +create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_citations_oc; + create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.result_classifications; +create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_apc; + create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.result_concepts; From 0dc33ea391f033578e890a8335dc28d36b8128d8 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 16 May 2022 09:20:30 +0200 Subject: [PATCH 08/14] [openorgs] fixed parent/child query, using the correct semantic labels --- .../dnetlib/dhp/oa/graph/sql/queryParentChildRelsOpenOrgs.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryParentChildRelsOpenOrgs.sql b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryParentChildRelsOpenOrgs.sql index 388fee3f5..0ac843401 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryParentChildRelsOpenOrgs.sql +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryParentChildRelsOpenOrgs.sql @@ -10,4 +10,4 @@ SELECT 'OpenOrgs Database' AS collectedfromname, 'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction FROM relationships -WHERE reltype = 'Child' OR reltype = 'Parent' \ No newline at end of file +WHERE reltype = 'IsChildOf' OR reltype = 'IsParentOf' \ No newline at end of file From 4c50f35c8bc41530f9f2dec095a11f3b3b2eaf9b Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 16 May 2022 10:29:36 +0200 Subject: [PATCH 09/14] update publication Date format --- .../resources/eu/dnetlib/dhp/sx/provision/scholix_index.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/provision/scholix_index.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/provision/scholix_index.json index 93032712a..86e80206f 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/provision/scholix_index.json +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/provision/scholix_index.json @@ -41,7 +41,8 @@ } }, "publicationDate": { - "type": "keyword" + "type": "date", + "format": "yyyy-MM-dd" }, "relationship": { "properties": { From 997c50078e01e3d64897661cd40fb96421c5e466 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 16 May 2022 12:07:40 +0200 Subject: [PATCH 10/14] [graph grouping] drop relation target path before copying from source --- .../dnetlib/dhp/oa/graph/group/oozie_app/workflow.xml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/group/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/group/oozie_app/workflow.xml index f77b46105..888a873c5 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/group/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/group/oozie_app/workflow.xml @@ -283,7 +283,15 @@ - + + + + + + + + + From d098ad0d9357e40dff1e0e2d8b07777dca5057a5 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 16 May 2022 15:53:27 +0200 Subject: [PATCH 11/14] [hb patch] updated map --- .../eu/dnetlib/dhp/datacite/hostedBy_map.json | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/hostedBy_map.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/hostedBy_map.json index 947a9a255..ecae6811a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/hostedBy_map.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/hostedBy_map.json @@ -15,7 +15,7 @@ "official_name": "Aperta TÜBİTAK Open Archive" }, "BL.CAM": { - "openaire_id": "re3data_____::r3d100010620", + "openaire_id": "opendoar____::109", "datacite_name": "Apollo", "official_name": "Apollo" }, @@ -196,7 +196,7 @@ }, "CSIC.DIGITAL": { "openaire_id": "re3data_____::r3d100011076", - "datacite_name": "DIGITAL.CSIC", + "datacite_name": "Digital CSIC", "official_name": "DIGITAL.CSIC" }, "BL.DRI": { @@ -644,6 +644,11 @@ "datacite_name": "PANGAEA", "official_name": "PANGAEA" }, + "TIB.PANGAEA": { + "openaire_id": "re3data_____::r3d100010134", + "datacite_name": "PANGAEA", + "official_name": "PANGAEA" + }, "NASAPDS.NASAPDS": { "openaire_id": "re3data_____::r3d100010121", "datacite_name": "PDS", @@ -896,7 +901,7 @@ }, "FIGSHARE.UCT": { "openaire_id": "re3data_____::r3d100012633", - "datacite_name": "ZivaHub", + "datacite_name": "University of Cape Town (UCT)", "official_name": "ZivaHub" }, "BL.UCLAN": { @@ -1030,9 +1035,9 @@ "official_name": "ZBW Journal Data Archive" }, "CERN.ZENODO": { - "openaire_id": "re3data_____::r3d100010468", + "openaire_id": "opendoar____::2659", "datacite_name": "Zenodo", - "official_name": "Zenodo" + "official_name": "ZENODO" }, "ZBW.ZEW": { "openaire_id": "re3data_____::r3d100010399", From f5207885e371e8e538069d2b301094bb0af185b0 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 17 May 2022 15:09:22 +0200 Subject: [PATCH 12/14] [EOSCTag] changed code to remove EOSC Jupyter Notebook and modified test to exclude galaxy + software from the tagging for Galaxy --- .../eu/dnetlib/dhp/bulktag/SparkEoscTag.java | 5 +++-- .../dnetlib/dhp/bulktag/EOSCTagJobTest.java | 21 ++++++------------- .../eosctag/jupyter/software/software_10.json | 2 +- 3 files changed, 10 insertions(+), 18 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java index e8c79e11d..b9de5dd11 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java @@ -95,13 +95,14 @@ public class SparkEoscTag { if (containsCriteriaNotebook(s)) { sbject.add(EOSC_NOTEBOOK); - if (sbject.stream().anyMatch(sb -> sb.getValue().equals("EOSC Jupyter Notebook"))){ + if (sbject.stream().anyMatch(sb -> sb.getValue().equals("EOSC Jupyter Notebook"))) { sbject = sbject.stream().map(sb -> { - if (sb.getValue().equals("EOSC Jupyter Notebook")){ + if (sb.getValue().equals("EOSC Jupyter Notebook")) { return null; } return sb; }).filter(Objects::nonNull).collect(Collectors.toList()); + s.setSubject(sbject); } } if (containsCriteriaGalaxy(s)) { diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java index 55d3939e1..1ea254157 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java @@ -132,7 +132,7 @@ public class EOSCTagJobTest { Assertions .assertEquals( - 2, tmp + 1, tmp .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .collect() .get(0) @@ -326,7 +326,7 @@ public class EOSCTagJobTest { Assertions .assertEquals( - 2, + 1, tmp .filter( s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))) @@ -352,21 +352,12 @@ public class EOSCTagJobTest { Assertions .assertEquals( - 6, tmp + 5, tmp .filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")) .collect() .get(0) .getSubject() .size()); - Assertions - .assertTrue( - tmp - .filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")) - .collect() - .get(0) - .getSubject() - .stream() - .anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow"))); Assertions .assertEquals( @@ -394,7 +385,7 @@ public class EOSCTagJobTest { Assertions .assertEquals( - 2, + 1, orp .filter( s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))) @@ -438,14 +429,14 @@ public class EOSCTagJobTest { Assertions .assertEquals( - 3, orp + 2, orp .filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72")) .collect() .get(0) .getSubject() .size()); Assertions - .assertTrue( + .assertFalse( orp .filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72")) .collect() diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/jupyter/software/software_10.json b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/jupyter/software/software_10.json index 2439dc1b6..2acc856a4 100644 --- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/jupyter/software/software_10.json +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/jupyter/software/software_10.json @@ -1,4 +1,4 @@ -{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1585055698387,"id":"50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4","originalId":["od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"],"collectedfrom":[{"key":"opendoar____::1582","value":"ProdInra","dataInfo":null}],"pid":[],"dateofcollection":"2019-01-24T16:45:07Z","dateoftransformation":"","extraInfo":[],"oaiprovenance":{"originDescription":{"harvestDate":"2019-01-23T18:34:35.459Z","altered":true,"baseURL":"http://oai.prodinra.inra.fr/ft","identifier":"oai:prodinra.inra.fr:442576","datestamp":"2018-11-12T00:00:00Z","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"author":[{"fullname":"Bitteur, Sylvaine","name":"Sylvaine","surname":"Bitteur","rank":1,"pid":null,"affiliation":null},{"fullname":"Hassouna, Melynda","name":"Melynda","surname":"Hassouna","rank":2,"pid":null,"affiliation":null}],"resulttype":{"classid":"software","classname":"software","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"country":[],"subject":[{"value": "jupyter Notebooks","qualifier": {"classid": "","classname": "","schemeid": "","schemename": "" },"dataInfo": {"invisible": false,"inferred": false,"deletedbyinference": false,"trust": "0.9","inferenceprovenance": "","provenanceaction": {"classid": "sysimport:crosswalk:repository","classname": "sysimport:crosswalk:repository","schemeid": "dnet:provenanceActions","schemename": "dnet:provenanceActions"}}}],"title":[{"value":"Charte graphique et site web International Symposium EmiLi 2012","qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"relevantdate":[],"description":[],"dateofacceptance":{"value":"2012-01-01","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"publisher":null,"embargoenddate":null,"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":null,"coverage":[],"bestaccessright":null,"context":[{"id":"http://zenodo.org/communities/dimpo"}],"externalReference":[],"instance":[{"license":{"value":"https://creativecommons.org/licenses/by-nd/3.0/","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"instancetype":{"classid":"0029","classname":"Software","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"hostedby":{"key":"10|doajarticles::8cec81178926caaca531afbd8eb5d64c","value":"ProdInra","dataInfo":null},"url":["http://prodinra.inra.fr/record/442576"],"distributionlocation":"","collectedfrom":{"key":"opendoar____::1582","value":"ProdInra","dataInfo":null},"dateofacceptance":{"value":"2012-01-01","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"processingchargeamount":null,"processingchargecurrency":null,"refereed":null}],"documentationUrl":[],"license":[],"codeRepositoryUrl":null,"programmingLanguage":null} +{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1585055698387,"id":"50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4","originalId":["od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"],"collectedfrom":[{"key":"opendoar____::1582","value":"ProdInra","dataInfo":null}],"pid":[],"dateofcollection":"2019-01-24T16:45:07Z","dateoftransformation":"","extraInfo":[],"oaiprovenance":{"originDescription":{"harvestDate":"2019-01-23T18:34:35.459Z","altered":true,"baseURL":"http://oai.prodinra.inra.fr/ft","identifier":"oai:prodinra.inra.fr:442576","datestamp":"2018-11-12T00:00:00Z","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"author":[{"fullname":"Bitteur, Sylvaine","name":"Sylvaine","surname":"Bitteur","rank":1,"pid":null,"affiliation":null},{"fullname":"Hassouna, Melynda","name":"Melynda","surname":"Hassouna","rank":2,"pid":null,"affiliation":null}],"resulttype":{"classid":"software","classname":"software","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"country":[],"subject":[{"value": "EOSC Jupyter Notebook","qualifier": {"classid": "","classname": "","schemeid": "","schemename": "" },"dataInfo": {"invisible": false,"inferred": false,"deletedbyinference": false,"trust": "0.9","inferenceprovenance": "","provenanceaction": {"classid": "sysimport:crosswalk:repository","classname": "sysimport:crosswalk:repository","schemeid": "dnet:provenanceActions","schemename": "dnet:provenanceActions"}}}],"title":[{"value":"Charte graphique et site web International Symposium EmiLi 2012","qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"relevantdate":[],"description":[],"dateofacceptance":{"value":"2012-01-01","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"publisher":null,"embargoenddate":null,"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":null,"coverage":[],"bestaccessright":null,"context":[{"id":"http://zenodo.org/communities/dimpo"}],"externalReference":[],"instance":[{"license":{"value":"https://creativecommons.org/licenses/by-nd/3.0/","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"instancetype":{"classid":"0029","classname":"Software","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"hostedby":{"key":"10|doajarticles::8cec81178926caaca531afbd8eb5d64c","value":"ProdInra","dataInfo":null},"url":["http://prodinra.inra.fr/record/442576"],"distributionlocation":"","collectedfrom":{"key":"opendoar____::1582","value":"ProdInra","dataInfo":null},"dateofacceptance":{"value":"2012-01-01","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"processingchargeamount":null,"processingchargecurrency":null,"refereed":null}],"documentationUrl":[],"license":[],"codeRepositoryUrl":null,"programmingLanguage":null} {"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1585055711745,"id":"50|od______1582::501b25d420f808c8eddcd9b16e917f11","originalId":["od______1582::501b25d420f808c8eddcd9b16e917f11"],"collectedfrom":[{"key":"opendoar____::1582","value":"ProdInra","dataInfo":null}],"pid":[],"dateofcollection":"2019-01-24T16:45:07Z","dateoftransformation":"","extraInfo":[],"oaiprovenance":{"originDescription":{"harvestDate":"2019-01-23T19:54:07.667Z","altered":true,"baseURL":"http://oai.prodinra.inra.fr/ft","identifier":"oai:prodinra.inra.fr:255703","datestamp":"2018-03-20T00:00:00Z","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"author":[{"fullname":"Petit R.J., Remy","name":"Remy","surname":"Petit R J","rank":1,"pid":null,"affiliation":null}],"resulttype":{"classid":"software","classname":"software","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"country":[],"subject":[{"value":"python","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"richesse allélique","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"gène polymorphe","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"loci","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"diversité des populations","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[{"value":"Contrib","qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"relevantdate":[],"description":[{"value":"This program provides a measure of the contribution of each population to total diversity (measured by H, the expected heterozygosity, or by R, the allelic richness). This contribution is splitted in two components: one due to the diversity of the population, the other due to its differentiation from the remaining populations. The program may also be used to obtain allelic richness after rarefaction (to a sample size chosen by the investigator) for a set of populations. It can be used in conjunction with the program haplodiv, based on the paper by Pons & Petit 1995, TAG 90, 462-470, which will provide standard errors for the diversity and differentiation parameters. The input file is a text file (see example: rartest.txt), where the first line indicates the number of haplotypes (here it is 18), the number of populations (here it is 4), and the rarefaction size (it should not be larger than the smallest population sample size; here the rarefaction size is 10, and the smallest sample size is 20). Then follows the data for each population (line), with the number of each haplotype in each population (don't use relative frequencies): 18 4 10 1 0 1 0 0 0 1 1 ...(18 columns) 0 1 2 1 1 0 13 0 ... 0 0 8 0 0 3 6 0 ... 1 0 9 0 0 3 7 1 Results can be seen in the output file (rartest.out here; to be printed horizontally). General measures are given first: within population diversity (Hs), total diversity (Ht), and Gst are given, followed by similar measures based on allelic richness. Then you get the results for each population : H, its standard error, allelic richness after rarefaction, the divergence from the other populations (DHs, DHt, DGst, see the paper in Conservation Biology), and the contributions Ct, Cs, Cd followed by the contributions for allelic richness measures. The program is written for an haploid gene but may be used for nuclear genes, assuming Hardy-Weinberg equilibrium. How to proceed when there are several loci? Do not take the mean across Gst or across Contributions. They are ratios, so you should take the mean of the numerator and the mean of the denominator separately. For the denominator: take the mean of hT and Rt-1 across loci. For the numerator: multiply the contributions by hT or Rt-1 (respectively for contributions to diversity or to allelic richness) and take the mean of these products across loci. Then compute the ratio of the two means.","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"dateofacceptance":{"value":"2006-01-01","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"publisher":null,"embargoenddate":null,"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":null,"coverage":[],"bestaccessright":null,"context":[{"id":"http://zenodo.org/communities/covid_19_senacyt_abc_panama"},{"id":"covid-19"}],"externalReference":[],"instance":[{"license":{"value":"https://creativecommons.org/licenses/by-sa/3.0/","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"instancetype":{"classid":"0029","classname":"Software","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"hostedby":{"key":"10|opendoar____::87ae6fb631f7c8a627e8e28785d9992d","value":"ProdInra","dataInfo":null},"url":["http://prodinra.inra.fr/record/255703"],"distributionlocation":"","collectedfrom":{"key":"opendoar____::1582","value":"ProdInra","dataInfo":null},"dateofacceptance":{"value":"2006-01-01","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"processingchargeamount":null,"processingchargecurrency":null,"refereed":null}],"documentationUrl":[],"license":[],"codeRepositoryUrl":null,"programmingLanguage":null} {"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1585055718681,"id":"50|od______1582::581621232a561b7e8b4952b18b8b0e56","originalId":["od______1582::581621232a561b7e8b4952b18b8b0e56"],"collectedfrom":[{"key":"opendoar____::1582","value":"ProdInra","dataInfo":null}],"pid":[],"dateofcollection":"2019-01-24T16:45:07Z","dateoftransformation":"","extraInfo":[],"oaiprovenance":{"originDescription":{"harvestDate":"2019-01-23T19:54:07.673Z","altered":true,"baseURL":"http://oai.prodinra.inra.fr/ft","identifier":"oai:prodinra.inra.fr:255707","datestamp":"2018-03-20T00:00:00Z","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"author":[{"fullname":"Petit R.J., Remy","name":"Remy","surname":"Petit R J","rank":1,"pid":null,"affiliation":null}],"resulttype":{"classid":"software","classname":"software","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"country":[],"subject":[{"value":"python","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"notebook","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"gène polymorphe","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"différenciation génétique","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"loci","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"application informatique","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"diversité des populations","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"haploïde","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[{"value":"Logiciels Permut et cpSSR","qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"relevantdate":[],"description":[{"value":"THE PROGRAM PERMUT AND THE PROGRAM CpSSR BECOME ONLY ONE PROGRAM.[br/] When you run the program you can choose if you want to use permut or CpSSR.[br/][br/] README PERMUT[br/] This program is based on the papers (Pons & Petit Genetics 1996, 144:1237-1245) and (Burban et al. 1999, Mol Ecol 8, 1593-1602).[br/] It computes measures of diversity and differenciation from haploid population genetic data, when a measure of the distance between haplotypes is available, and test whether the differentiation and diversity measures differ from the equivalent measures that do not take into account the distances between haplotypes (ie, that consider all haplotypes equally divergent).[br/] The source file should be an ASCII file (its name should have 8 characters maximum: 12345678.txt) and should include the following information:[br/] First line :[br/] Number of cytotypes Number of populations Number of characters distinguishing the variants (for instance number of polymorphic fragments, or of polymorphic nucleotide sites). The program asks for the number of permutations to be made.[br/] see the example (\\ExamplePermut\\input.txt and \\ExamplePermut\\output.out).[br/] Then follows the number of individuals having a given cytotype (column) in a given population (row). Finally, and without interruption, provide the table of character states for all haplotypes, where each line corresponds to one haplotype, and each column to a character. No column should be empty (no missing haplotype) and each population (row) should be composed of AT LEAST 3 individuals![br/] The output file provides permutated values of Nst in a single row, and the value of the last 5% and last 1%. The mean of the permutated values is also given and should be close to the Gst value (by construction). To test if the observed Nst value is larger than the Gst, we count how many permutated values are larger than the observed Nst. If you have 5% of the permutated values greater than the observed value of Nst, then your test is not significant, otherwise it is and you know the P-value. This is akin to testing if Gst = Nst.[br/] [br/] README CpSSR :[br/] It computes measures of diversity and differenciation from haploid population genetic data, when the difference in number of repeats between alleles is available, and tests whether the differentiation and diversity measures differ from the equivalent measures when the distances between haplotypes is not considered (ie, when all haplotypes are considered equally divergent). The source file should be an ASCII file (its name should have 8 characters maximum: 12345678.txt) and should include the following information:[br/] First line :[br/] Number of cytotypes Number of populations Number of cpSSR loci. The program asks for the number of permutations to be made. See the example (\\ExampleCpSSR\\input.txt and \\ExamplePermut\\CpSSR.out).[br/] Then follows the number of individuals having a given haplotype (column) in a given population (row). Finally, and without interruption, provide the table of length variant states for all haplotypes, where each line corresponds to one haplotype, and each column to a character. No column should be empty (no missing haplotype) and each population (row) should be composed of AT LEAST 3 individuals![br/] The output file provides permutated values of Rst in a single row, and the value of the last 5% and last 1%. The mean of the permutated values is also given and should be close to the Gst value (by construction). To test if the observed Rst value is larger than the Gst, you count how many permutated values are larger than the observed Rst. If you have 5% of the permutated values greater than the observed value of Rst, then your test is not significant, otherwise it is and you know the P-value. This is akin to testing if Gst = Rst. I usually go for a one-sided test (i.e. I test if Rst>Gst, and not Rst<>Gst).","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"dateofacceptance":{"value":"2012-01-01","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"publisher":null,"embargoenddate":null,"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":null,"coverage":[],"bestaccessright":null,"context":[{"id":"http://zenodo.org/communities/euromixproject"}],"externalReference":[],"instance":[{"license":{"value":"https://creativecommons.org/licenses/by-sa/3.0/","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"instancetype":{"classid":"0029","classname":"Software","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"hostedby":{"key":"10|opendoar____::fd4c2dc64ccb8496e6f1f94c85f30d06","value":"ProdInra","dataInfo":null},"url":["http://prodinra.inra.fr/record/255707"],"distributionlocation":"","collectedfrom":{"key":"opendoar____::1582","value":"ProdInra","dataInfo":null},"dateofacceptance":{"value":"2012-01-01","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"processingchargeamount":null,"processingchargecurrency":null,"refereed":null}],"documentationUrl":[],"license":[],"codeRepositoryUrl":null,"programmingLanguage":null} {"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1585055721330,"id":"50|od______1582::5aec1186054301b66c0c5dc35972a589","originalId":["od______1582::5aec1186054301b66c0c5dc35972a589"],"collectedfrom":[{"key":"opendoar____::1582","value":"ProdInra","dataInfo":null}],"pid":[],"dateofcollection":"2019-01-24T16:45:07Z","dateoftransformation":"","extraInfo":[],"oaiprovenance":{"originDescription":{"harvestDate":"2019-01-23T18:54:28.567Z","altered":true,"baseURL":"http://oai.prodinra.inra.fr/ft","identifier":"oai:prodinra.inra.fr:402973","datestamp":"2018-03-19T00:00:00Z","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"author":[{"fullname":"Muratorio, Sylvie","name":"Sylvie","surname":"Muratorio","rank":1,"pid":null,"affiliation":null}],"resulttype":{"classid":"software","classname":"software","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"country":[],"subject":[{"value":"notebook","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"modèle physiologique","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"approche génétique","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"castanea","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"fagus sylvatica","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[{"value":"PDG Documentation, version 2","qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"relevantdate":[],"description":[],"dateofacceptance":{"value":"2017-01-01","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"publisher":null,"embargoenddate":null,"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":null,"coverage":[],"bestaccessright":null,"context":[],"externalReference":[],"instance":[{"license":{"value":"https://creativecommons.org/licenses/by-sa/3.0/","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"instancetype":{"classid":"0029","classname":"Software","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"hostedby":{"key":"opendoar____::1582","value":"ProdInra","dataInfo":null},"url":["http://prodinra.inra.fr/record/402973"],"distributionlocation":"","collectedfrom":{"key":"opendoar____::1582","value":"ProdInra","dataInfo":null},"dateofacceptance":{"value":"2017-01-01","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"processingchargeamount":null,"processingchargecurrency":null,"refereed":null}],"documentationUrl":[],"license":[],"codeRepositoryUrl":null,"programmingLanguage":null} From c298c148cb285e73fab8079d63f6bbef4670f653 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 20 May 2022 09:11:46 +0200 Subject: [PATCH 13/14] [CountryPropagation] fix NPE issue --- .../SparkCountryPropagationJob.java | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java index 56aa953b4..25cd82248 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java @@ -102,21 +102,27 @@ public class SparkCountryPropagationJob { private static MapFunction, R> getCountryMergeFn() { return t -> { Optional.ofNullable(t._2()).ifPresent(r -> { - t._1().getCountry().addAll(merge(t._1().getCountry(), r.getCountrySet())); + if(Optional.ofNullable(t._1().getCountry()).isPresent()) + t._1().getCountry().addAll(merge(t._1().getCountry(), r.getCountrySet())); + else + t._1().setCountry(merge(null, t._2().getCountrySet())); }); return t._1(); }; } private static List merge(List c1, List c2) { - HashSet countries = c1 - .stream() - .map(Qualifier::getClassid) - .collect(Collectors.toCollection(HashSet::new)); + HashSet countries = new HashSet<>(); + if(Optional.ofNullable(c1).isPresent()){ + countries = c1.stream().map(Qualifier::getClassid) + .collect(Collectors.toCollection(HashSet::new)); + } + + HashSet finalCountries = countries; return c2 .stream() - .filter(c -> !countries.contains(c.getClassid())) + .filter(c -> !finalCountries.contains(c.getClassid())) .map(c -> getCountry(c.getClassid(), c.getClassname())) .collect(Collectors.toList()); } From 5e0b8f9b5fa27dc7e595c65753a682b12ec3e553 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 20 May 2022 09:15:53 +0200 Subject: [PATCH 14/14] [CountryPropagation] refactoring --- .../SparkCountryPropagationJob.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java index 25cd82248..d9f6433a0 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java @@ -102,7 +102,7 @@ public class SparkCountryPropagationJob { private static MapFunction, R> getCountryMergeFn() { return t -> { Optional.ofNullable(t._2()).ifPresent(r -> { - if(Optional.ofNullable(t._1().getCountry()).isPresent()) + if (Optional.ofNullable(t._1().getCountry()).isPresent()) t._1().getCountry().addAll(merge(t._1().getCountry(), r.getCountrySet())); else t._1().setCountry(merge(null, t._2().getCountrySet())); @@ -113,11 +113,12 @@ public class SparkCountryPropagationJob { private static List merge(List c1, List c2) { HashSet countries = new HashSet<>(); - if(Optional.ofNullable(c1).isPresent()){ - countries = c1.stream().map(Qualifier::getClassid) - .collect(Collectors.toCollection(HashSet::new)); - } - + if (Optional.ofNullable(c1).isPresent()) { + countries = c1 + .stream() + .map(Qualifier::getClassid) + .collect(Collectors.toCollection(HashSet::new)); + } HashSet finalCountries = countries; return c2