From cae92cf8117420af1e643baa148796e77248fb81 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 6 Jun 2023 14:06:06 +0200 Subject: [PATCH 1/7] update sql query to return distinct pids --- .../dnetlib/dhp/oa/graph/sql/queryOpenOrgsForProvision.sql | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOpenOrgsForProvision.sql b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOpenOrgsForProvision.sql index 133b817d27..ae6caab7c1 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOpenOrgsForProvision.sql +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOpenOrgsForProvision.sql @@ -15,7 +15,10 @@ SELECT 'OpenOrgs Database' AS collectedfromname, o.country || '@@@dnet:countries' AS country, 'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction, - array_remove(array_cat(array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types'), array_agg(DISTINCT idup.otherid || '###' || idup.type || '@@@dnet:pid_types')), NULL) AS pid, + ARRAY(SELECT DISTINCT pid FROM unnest(array_cat( + array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types'), + array_agg(DISTINCT idup.otherid || '###' || idup.type || '@@@dnet:pid_types') + )) as t(pid) where pid IS NOT NULL), (array_remove(array_cat(ARRAY[o.ec_legalbody], array_agg(od.ec_legalbody)), NULL))[1] AS eclegalbody, (array_remove(array_cat(ARRAY[o.ec_legalperson], array_agg(od.ec_legalperson)), NULL))[1] AS eclegalperson, (array_remove(array_cat(ARRAY[o.ec_nonprofit], array_agg(od.ec_nonprofit)), NULL))[1] AS ecnonprofit, @@ -41,4 +44,4 @@ GROUP BY o.name, o.creation_date, o.modification_date, - o.country; \ No newline at end of file + o.country; From 5befd93d7d8ecc9c0d8691190c92a30e93d35884 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Tue, 6 Jun 2023 
14:34:33 +0200 Subject: [PATCH 2/7] test records for Solr indexing --- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 17 +++ .../eu/dnetlib/dhp/oa/graph/raw/leiden.xml | 75 ++++++++++ .../provision/IndexRecordTransformerTest.java | 6 + .../dhp/oa/provision/eosc-future/sentinel.xml | 138 ++++++++++++++++++ 4 files changed, 236 insertions(+) create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/leiden.xml create mode 100644 dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/sentinel.xml diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index bfd6d461dd..4f9522385d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -1023,6 +1023,23 @@ class MappersTest { } + @Test + void testLeiden() throws IOException { + final String xml = IOUtils + .toString(Objects.requireNonNull(getClass().getResourceAsStream("leiden.xml"))); + final List actual = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); + assertNotNull(actual); + assertFalse(actual.isEmpty()); + System.out.println("***************"); + System.out.println(new ObjectMapper().writeValueAsString(actual)); + System.out.println("***************"); + final Publication d = (Publication) actual.get(0); + assertValidId(d.getId()); + assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); + assertTrue(StringUtils.isNotBlank(d.getInstance().get(0).getUrl().get(0))); + + } + private void assertValidId(final String id) { // System.out.println(id); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/leiden.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/leiden.xml 
new file mode 100644 index 0000000000..ac674deb64 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/leiden.xml @@ -0,0 +1,75 @@ + + +
+ oai:scholarlypublications.universiteitleiden.nl:item_2870593 + 2021-11-03T14:09:07Z + hdl_1887_4540 + hdl_1887_4539 + hdl_1887_26883 + hdl_1887_20765 + open_access + 2023-05-18T01:24:03.623Z + od_______202::0032acf47e4939f8ae28554dfd1240de + 1887/9526 + 2023-03-17T13:30:02.026+01:00 + od_______202 +
+ + + 1887/9526 + + urn:nbn:nl:ui:26-1887/9526 + 22146427 + + + contribution to journal + NA + + open access + Leiden University Non-exclusive license + + + Tweesporenbeleid bij ontwikkeling Afrikaanse curricula: Afrika-Studiecentrum reageert op Peter Crossman + + nl + + application/pdf + + + + Hesseling, G.S.C.M. + G.S.C.M. + Hesseling + 069053138 + Afrika Studiecentrum + + + + 1999 + + + Africa + universities + + + 1887/9526 + urn:nbn:nl:ui:26-1887/9526 + 0038 + 1999-01-01 + OPEN + dut/nld + + + +
\ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java index 74f203cbfb..7d31790a2c 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java @@ -128,6 +128,12 @@ public class IndexRecordTransformerTest { testRecordTransformation(record); } + @Test + public void testForEOSCFutureSentinel() throws IOException, TransformerException { + final String record = IOUtils.toString(getClass().getResourceAsStream("eosc-future/sentinel.xml")); + testRecordTransformation(record); + } + @Test public void testForEdithDemo() throws IOException, TransformerException { final String record = IOUtils.toString(getClass().getResourceAsStream("edith-demo/10.1098-rsta.2020.0257.xml")); diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/sentinel.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/sentinel.xml new file mode 100644 index 0000000000..475a375d32 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/sentinel.xml @@ -0,0 +1,138 @@ + + +
+ doi_dedup___::10a910f4a66b7f4bce8407d7a486a80a + 2023-04-05T00:36:27+0000 + 2023-04-05T07:33:52.185Z +
+ + + + + + 50|datacite____::10a910f4a66b7f4bce8407d7a486a80a + 10.5281/zenodo.6967373 + 50|datacite____::172969c66c312a9656fc745f0ec62ce5 + 10.5281/zenodo.6969999 + 50|datacite____::4fa8f1c89ff11e8e99f9ded870ade80d + 10.5281/zenodo.6967372 + 50|datacite____::a466b6173773d742b7a5881682748a8c + 10.5281/zenodo.6970067 + 10.5281/zenodo.6967373 + 10.5281/zenodo.6969999 + 10.5281/zenodo.6967372 + 10.5281/zenodo.6970067 + Sentinel-3 NDVI ARD and Long Term Statistics (1999-2019) from the Copernicus Global Land Service over Lombardia + + Marasco Pier Lorenzo + 2022-08-05 + Sentinel-3 NDVI Analysis Ready Data (ARD) (C_GLS_NDVI_20220101_20220701_Lombardia_S3_2.nc) product provided by the Copernicus Global Land Service [3]. The file C_GLS_NDVI_20220101_20220701_Lombardia_S3_2_masked.nc is derived from C_GLS_NDVI_20220101_20220701_Lombardia_S3_2.nc but values have been scaled (raw_value * ( 1/250) - 0.08) and values lower then -0.08 and greater than 0.92 have been removed (set to missing values). The original dataset can also be discovered through the OpenEO API[5] from the CGLS distributor VITO [4]. Access is free of charge but an EGI registration is needed. The file called Italy.geojson has been created using the Global Administrative Unit Layers GAUL G2015_2014 provided by FAO-UN (see Documentation). It only contains information related to Italy. Further info about drought indexes can be found in the Integrated Drought Management Programme [5] [1] Application of vegetation index and brightness temperature for drought detection [2] NDVI [3] Copernicus Global Land Service [4] Vito [5] OpenEO [5] Integrated Drought Management + These datasets are used for training purposes. 
See https://pangeo-data.github.io/foss4g-2022/intro.html + NDVI + vegetaion + Copernicus Global Land Service + pangeo + + 2022-08-05 + Zenodo + + + + + true + false + 0.8 + dedup-result-decisiontree-v3 + + + + + + Zenodo + 10.5281/zenodo.6967372 + 2022-08-05 + + Sentinel-3 NDVI ARD and Long Term Statistics (1999-2019) from the Copernicus Global Land Service over Lombardia + + + Zenodo + 10.5281/zenodo.6970067 + 2022-08-05 + + Sentinel-3 NDVI ARD and Long Term Statistics (1999-2019) from the Copernicus Global Land Service over Lombardia + + + Zenodo + 2022-08-05 + 10.5281/zenodo.6969999 + + Sentinel-3 NDVI ARD and Long Term Statistics (1999-2019) from the Copernicus Global Land Service over Lombardia + + + Zenodo + 2022-08-05 + + Sentinel-3 NDVI ARD and Long Term Statistics (1999-2019) from the Copernicus Global Land Service over Lombardia + 10.5281/zenodo.6967373 + + + + + + 2022-08-05 + + 10.5281/zenodo.6967373 + + https://creativecommons.org/licenses/by/4.0/legalcode + + https://doi.org/10.5281/zenodo.6967373 + + + + + + + 2022-08-05 + + 10.5281/zenodo.6970067 + + https://creativecommons.org/licenses/by/4.0/legalcode + + https://doi.org/10.5281/zenodo.6970067 + + + + + + + 2022-08-05 + + 10.5281/zenodo.6969999 + + https://creativecommons.org/licenses/by/4.0/legalcode + + https://doi.org/10.5281/zenodo.6969999 + + + + + + + 2022-08-05 + + 10.5281/zenodo.6967372 + + https://creativecommons.org/licenses/by/4.0/legalcode + + https://doi.org/10.5281/zenodo.6967372 + + + + + + +
+
\ No newline at end of file From 118e72d7db65f1f7e2738b2e4d3509ce6eb3dfd6 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Tue, 6 Jun 2023 14:39:12 +0200 Subject: [PATCH 3/7] Updated official name of pangaea in hostedbymap for Datacite to avoid duplicate entries in the source filter of the portal --- .../main/resources/eu/dnetlib/dhp/datacite/hostedBy_map.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/hostedBy_map.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/hostedBy_map.json index ecae6811a1..9088d29600 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/hostedBy_map.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/hostedBy_map.json @@ -642,12 +642,12 @@ "PANGAEA.REPOSITORY": { "openaire_id": "re3data_____::r3d100010134", "datacite_name": "PANGAEA", - "official_name": "PANGAEA" + "official_name": "PANGAEA - Data Publisher for Earth and Environmental Science" }, "TIB.PANGAEA": { "openaire_id": "re3data_____::r3d100010134", "datacite_name": "PANGAEA", - "official_name": "PANGAEA" + "official_name": "PANGAEA - Data Publisher for Earth and Environmental Science" }, "NASAPDS.NASAPDS": { "openaire_id": "re3data_____::r3d100010121", From a92206dab5ee5b47deef8c6bf8ae5d36100c2557 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 13 Jun 2023 11:43:10 +0200 Subject: [PATCH 4/7] re-added the name of a column (pid) --- .../eu/dnetlib/dhp/oa/graph/sql/queryOpenOrgsForProvision.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOpenOrgsForProvision.sql b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOpenOrgsForProvision.sql index ae6caab7c1..87e36d51be 100644 --- 
a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOpenOrgsForProvision.sql +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryOpenOrgsForProvision.sql @@ -18,7 +18,7 @@ SELECT ARRAY(SELECT DISTINCT pid FROM unnest(array_cat( array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types'), array_agg(DISTINCT idup.otherid || '###' || idup.type || '@@@dnet:pid_types') - )) as t(pid) where pid IS NOT NULL), + )) as t(pid) where pid IS NOT NULL) AS pid, (array_remove(array_cat(ARRAY[o.ec_legalbody], array_agg(od.ec_legalbody)), NULL))[1] AS eclegalbody, (array_remove(array_cat(ARRAY[o.ec_legalperson], array_agg(od.ec_legalperson)), NULL))[1] AS eclegalperson, (array_remove(array_cat(ARRAY[o.ec_nonprofit], array_agg(od.ec_nonprofit)), NULL))[1] AS ecnonprofit, From 485f9d18cbebf00969b0f9e795de5c8fdb49f9aa Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 19 Jun 2023 13:00:02 +0200 Subject: [PATCH 5/7] REmove duplicated code and ensure that load and initialization is done through "DedupConfig.load" method --- .../java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java index 1364133763..68af3d6994 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java @@ -88,9 +88,7 @@ abstract class AbstractSparkAction implements Serializable { "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", configProfileId)); - DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class); - dedupConfig.getPace().initModel(); - 
dedupConfig.getPace().initTranslationMap(); + DedupConfig dedupConfig = DedupConfig.load(conf); dedupConfig.getWf().setConfigurationId(actionSetId); return dedupConfig; From 758e662ab856fb3af626457be8ad6a0b57ad6782 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 19 Jun 2023 13:08:10 +0200 Subject: [PATCH 6/7] Revert "REmove duplicated code and ensure that load and initialization is done through "DedupConfig.load" method" This reverts commit 485f9d18cbebf00969b0f9e795de5c8fdb49f9aa. --- .../java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java index 68af3d6994..1364133763 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java @@ -88,7 +88,9 @@ abstract class AbstractSparkAction implements Serializable { "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", configProfileId)); - DedupConfig dedupConfig = DedupConfig.load(conf); + DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class); + dedupConfig.getPace().initModel(); + dedupConfig.getPace().initTranslationMap(); dedupConfig.getWf().setConfigurationId(actionSetId); return dedupConfig; From 009d7f312f66b5338968986bb3c3d480b250ada5 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Wed, 21 Jun 2023 16:17:34 +0200 Subject: [PATCH 7/7] fixed a datasource Id --- .../src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala 
b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala index ffdab1799a..c5c6026f19 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala @@ -82,7 +82,7 @@ object BioDBToOAF { "Springer Nature" ) val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue( - "10|opendoar____::83e60e09c222f206c725385f53d7e567c", + "10|opendoar____::3e60e09c222f206c725385f53d7e567c", "EMBL-EBIs Protein Data Bank in Europe (PDBe)" ) val pubmedCollectedFrom: KeyValue =