merging with branch beta

2022-05-24 18:37:32 +02:00 · 2022-05-24 18:37:32 +02:00 · 108e17644e
parent ba642d53ff 5c2949a864
commit 108e17644e
12 changed files with 143 additions and 78 deletions
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/hostedBy_map.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/hostedBy_map.json
@ -15,7 +15,7 @@
  "official_name": "Aperta TÜBİTAK Open Archive"
 },
 "BL.CAM": {
-  "openaire_id": "re3data_____::r3d100010620",
+  "openaire_id": "opendoar____::109",
  "datacite_name": "Apollo",
  "official_name": "Apollo"
 },
@ -196,7 +196,7 @@
 },
 "CSIC.DIGITAL": {
  "openaire_id": "re3data_____::r3d100011076",
-  "datacite_name": "DIGITAL.CSIC",
+  "datacite_name": "Digital CSIC",
  "official_name": "DIGITAL.CSIC"
 },
 "BL.DRI": {
@ -644,6 +644,11 @@
  "datacite_name": "PANGAEA",
  "official_name": "PANGAEA"
 },
 "TIB.PANGAEA": {
  "openaire_id": "re3data_____::r3d100010134",
  "datacite_name": "PANGAEA",
  "official_name": "PANGAEA"
 },
 "NASAPDS.NASAPDS": {
  "openaire_id": "re3data_____::r3d100010121",
  "datacite_name": "PDS",
@ -896,7 +901,7 @@
 },
 "FIGSHARE.UCT": {
  "openaire_id": "re3data_____::r3d100012633",
-  "datacite_name": "ZivaHub",
+  "datacite_name": "University of Cape Town (UCT)",
  "official_name": "ZivaHub"
 },
 "BL.UCLAN": {
@ -1030,9 +1035,9 @@
  "official_name": "ZBW Journal Data Archive"
 },
 "CERN.ZENODO": {
-  "openaire_id": "re3data_____::r3d100010468",
+  "openaire_id": "opendoar____::2659",
  "datacite_name": "Zenodo",
-  "official_name": "Zenodo"
+  "official_name": "ZENODO"
 },
 "ZBW.ZEW": {
  "openaire_id": "re3data_____::r3d100010399",
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java
@ -30,7 +30,7 @@ public class SparkEoscTag {
 	public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	public static final Qualifier EOSC_QUALIFIER = OafMapperUtils
 		.qualifier(
-			"eosc",
+			"EOSC",
 			"European Open Science Cloud",
 			ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES);
 	public static final DataInfo EOSC_DATAINFO = OafMapperUtils
@ -95,7 +95,15 @@ public class SparkEoscTag {
 				if (containsCriteriaNotebook(s)) {
 					sbject.add(EOSC_NOTEBOOK);
-
+					if (sbject.stream().anyMatch(sb -> sb.getValue().equals("EOSC Jupyter Notebook"))) {
 						sbject = sbject.stream().map(sb -> {
 							if (sb.getValue().equals("EOSC Jupyter Notebook")) {
 								return null;
 							}
 							return sb;
 						}).filter(Objects::nonNull).collect(Collectors.toList());
 						s.setSubject(sbject);
 					}
 				}
 				if (containsCriteriaGalaxy(s)) {
 					sbject.add(EOSC_GALAXY);
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java
@ -102,21 +102,28 @@ public class SparkCountryPropagationJob {
 	private static <R extends Result> MapFunction<Tuple2<R, ResultCountrySet>, R> getCountryMergeFn() {
 		return t -> {
 			Optional.ofNullable(t._2()).ifPresent(r -> {
-				t._1().getCountry().addAll(merge(t._1().getCountry(), r.getCountrySet()));
+				if (Optional.ofNullable(t._1().getCountry()).isPresent())
 					t._1().getCountry().addAll(merge(t._1().getCountry(), r.getCountrySet()));
 				else
 					t._1().setCountry(merge(null, t._2().getCountrySet()));
 			});
 			return t._1();
 		};
 	}
 	private static List<Country> merge(List<Country> c1, List<CountrySbs> c2) {
-		HashSet<String> countries = c1
+		HashSet<String> countries = new HashSet<>();
-			.stream()
+		if (Optional.ofNullable(c1).isPresent()) {
-			.map(Qualifier::getClassid)
+			countries = c1
-			.collect(Collectors.toCollection(HashSet::new));
+				.stream()
 				.map(Qualifier::getClassid)
 				.collect(Collectors.toCollection(HashSet::new));
 		}
 		HashSet<String> finalCountries = countries;
 		return c2
 			.stream()
-			.filter(c -> !countries.contains(c.getClassid()))
+			.filter(c -> !finalCountries.contains(c.getClassid()))
 			.map(c -> getCountry(c.getClassid(), c.getClassname()))
 			.collect(Collectors.toList());
 	}
--- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java
+++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java
@ -132,7 +132,7 @@ public class EOSCTagJobTest {
 		Assertions
 			.assertEquals(
-				2, tmp
+				1, tmp
 					.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
 					.collect()
 					.get(0)
@ -326,7 +326,7 @@ public class EOSCTagJobTest {
 		Assertions
 			.assertEquals(
-				2,
+				1,
 				tmp
 					.filter(
 						s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
@ -352,21 +352,12 @@ public class EOSCTagJobTest {
 		Assertions
 			.assertEquals(
-				6, tmp
+				5, tmp
 					.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.size());
 		Assertions
 			.assertTrue(
 				tmp
 					.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.stream()
 					.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
 		Assertions
 			.assertEquals(
@ -394,7 +385,7 @@ public class EOSCTagJobTest {
 		Assertions
 			.assertEquals(
-				2,
+				1,
 				orp
 					.filter(
 						s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
@ -438,14 +429,14 @@ public class EOSCTagJobTest {
 		Assertions
 			.assertEquals(
-				3, orp
+				2, orp
 					.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72"))
 					.collect()
 					.get(0)
 					.getSubject()
 					.size());
 		Assertions
-			.assertTrue(
+			.assertFalse(
 				orp
 					.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72"))
 					.collect()
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/jupyter/software/software_10.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/eosctag/jupyter/software/software_10.json
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/group/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/group/oozie_app/workflow.xml
@ -283,7 +283,15 @@
        <error to="Kill"/>
    </action>
-    <join name="wait_dispatch" to="copy_relation"/>
+    <join name="wait_dispatch" to="delete_target_relation"/>
    <action name="delete_target_relation">
        <fs>
            <delete path="${nameNode}/${graphOutputPath}/relation"/>
        </fs>
        <ok to="copy_relation"/>
        <error to="Kill"/>
    </action>
    <action name="copy_relation">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryParentChildRelsOpenOrgs.sql
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryParentChildRelsOpenOrgs.sql
@ -10,4 +10,4 @@ SELECT
 	'OpenOrgs Database'                                            AS collectedfromname,
 	'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction
 FROM relationships
-WHERE reltype = 'Child' OR reltype = 'Parent'
+WHERE reltype = 'IsChildOf' OR reltype = 'IsParentOf'
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala
@ -103,21 +103,19 @@ object SparkConvertRDDtoDataset {
      "IsAmongTopNSimilarDocuments"
    )
    val rddRelation = spark.sparkContext
      .textFile(s"$sourcePath/relation")
      .map(s => mapper.readValue(s, classOf[Relation]))
      .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
      .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
      //filter OpenCitations relations
-      .filter(r => r.getCollectedfrom!= null && r.getCollectedfrom.size()>0 && !r.getCollectedfrom.asScala.exists(k => "opencitations".equalsIgnoreCase(k.getValue)))
+      .filter(r =>
        r.getCollectedfrom != null && r.getCollectedfrom.size() > 0 && !r.getCollectedfrom.asScala.exists(k =>
          "opencitations".equalsIgnoreCase(k.getValue)
        )
      )
      .filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
    spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
  }
 }
--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/provision/scholix_index.json
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/provision/scholix_index.json
@ -41,7 +41,8 @@
        }
      },
      "publicationDate": {
-        "type": "keyword"
+        "type":   "date",
        "format": "yyyy-MM-dd"
      },
      "relationship": {
        "properties": {
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql
@ -80,4 +80,34 @@ where reltype='resultResult'
    and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE
    and r1.resulttype.classname != 'other'
    and r2.resulttype.classname != 'other'
-    and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE;
+    and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE;
 create table ${stats_db_name}.result_citations_oc stored as parquet as
 select substr(target, 4) as id, count(distinct substr(source, 4)) as citations
 from ${openaire_db_name}.relation rel
 join ${openaire_db_name}.result r1 on rel.source=r1.id
 join ${openaire_db_name}.result r2 on r2.id=rel.target
 where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:crosswalk:opencitations'
    and reltype='resultResult'
    and r1.resulttype.classname!=r2.resulttype.classname
    and r1.datainfo.deletedbyinference=false and r1.datainfo.invisible = FALSE
    and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE
    and r1.resulttype.classname != 'other'
    and r2.resulttype.classname != 'other'
    and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
 group by substr(target, 4);
 create table ${stats_db_name}.result_references_oc stored as parquet as
 select substr(source, 4) as id, count(distinct substr(target, 4)) as references
 from ${openaire_db_name}.relation rel
         join ${openaire_db_name}.result r1 on rel.source=r1.id
         join ${openaire_db_name}.result r2 on r2.id=rel.target
 where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:crosswalk:opencitations'
  and reltype='resultResult'
  and r1.resulttype.classname!=r2.resulttype.classname
    and r1.datainfo.deletedbyinference=false and r1.datainfo.invisible = FALSE
    and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE
    and r1.resulttype.classname != 'other'
    and r2.resulttype.classname != 'other'
    and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
 group by substr(source, 4);
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
@ -82,31 +82,31 @@ on r.id= tmp.id;
 compute stats indi_funded_result_with_fundref;
-create table indi_result_org_country_collab stored as parquet as
+-- create table indi_result_org_country_collab stored as parquet as
-with tmp as
+-- with tmp as
-(select o.id as id, o.country , ro.id as result,r.type  from organization o
+-- (select o.id as id, o.country , ro.id as result,r.type  from organization o
-join result_organization ro on o.id=ro.organization
+-- join result_organization ro on o.id=ro.organization
-join result r on r.id=ro.id where o.country <> 'UNKNOWN')
+-- join result r on r.id=ro.id where o.country <> 'UNKNOWN')
-select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations
+-- select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations
-from tmp as o1
+-- from tmp as o1
-join tmp as o2 on o1.result=o2.result
+-- join tmp as o2 on o1.result=o2.result
-where o1.id<>o2.id and o1.country<>o2.country
+-- where o1.id<>o2.id and o1.country<>o2.country
-group by o1.id, o1.type,o2.country;
+-- group by o1.id, o1.type,o2.country;
 --
 -- compute stats indi_result_org_country_collab;
-compute stats indi_result_org_country_collab;
+-- create table indi_result_org_collab stored as parquet as
-
+-- with tmp as
-create table indi_result_org_collab stored as parquet as
+-- (select o.id, ro.id as result,r.type  from organization o
-with tmp as
+-- join result_organization ro on o.id=ro.organization
-(select o.id, ro.id as result,r.type  from organization o
+-- join result r on r.id=ro.id)
-join result_organization ro on o.id=ro.organization
+-- select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations
-join result r on r.id=ro.id)
+-- from tmp as o1
-select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations
+-- join tmp as o2 on o1.result=o2.result
-from tmp as o1
+-- where o1.id<>o2.id
-join tmp as o2 on o1.result=o2.result
+-- group by o1.id, o2.id, o1.type;
-where o1.id<>o2.id
+--
-group by o1.id, o2.id, o1.type;
+-- compute stats indi_result_org_collab;
 compute stats indi_result_org_collab;
 create table indi_funder_country_collab stored as parquet as
 with tmp as (select funder, project, country from organization_projects op
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql
@ -18,28 +18,45 @@ create table TARGET.result stored as parquet as
        select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
        union all
        select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
-            'openorgs____::759d59f05d77188faee99b7493b46805',
+             'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC"
-            'openorgs____::b84450f9864182c67b8611b5593f4250',
+             'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
-            'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975',
+             'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ??
-            'openorgs____::eadc8da90a546e98c03f896661a2e4d4',
+             'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
-            'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2',
+             'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
-            'openorgs____::d169c7407dd417152596908d48c11460',
+             'openorgs____::2fb1e47b4612688d9de9169d579939a7', --University of Helsinki
-            'openorgs____::1ec924b1759bb16d0a02f2dad8689b21',
+             'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
-            'openorgs____::2fb1e47b4612688d9de9169d579939a7',
+             'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
-            'openorgs____::759d59f05d77188faee99b7493b46805',
+             'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
-            'openorgs____::cad284878801b9465fa51a95b1d779db',
+             'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
-            'openorgs____::eadc8da90a546e98c03f896661a2e4d4',
+             -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
-            'openorgs____::c0286313e36479eff8676dba9b724b40'
+             'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
-            -- ,'openorgs____::c80a8243a5e5c620d7931c88d93bf17a' -- Paris Diderot
+             'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
-            ) )) foo;
+             'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
             'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
             'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
             'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
             'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
             'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
             'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
             'openorgs____::6445d7758d3a40c4d997953b6632a368' --National Institute of Informatics (NII)
        ) )) foo;
 compute stats TARGET.result;
 create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 compute stats TARGET.result_citations;
 create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 compute stats TARGET.result_references_oc;
 create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 compute stats TARGET.result_citations_oc;
 create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 compute stats TARGET.result_classifications;
 create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 compute stats TARGET.result_apc;
 create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id);
 compute stats TARGET.result_concepts;