From f1d7d45cf7b210c43d8af9635220bc710fce2341 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 28 Sep 2022 12:01:43 +0200 Subject: [PATCH 01/30] [BulkTag] fixed issue --- .../dnetlib/dhp/bulktag/community/ResultTagger.java | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java index ee75bf955..0452a6ebf 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java @@ -158,7 +158,8 @@ public class ResultTagger implements Serializable { } result.getContext().forEach(c -> { - if (communities.contains(c.getId())) { + final String cId = c.getId(); + if (communities.contains(cId)) { Optional> opt_dataInfoList = Optional.ofNullable(c.getDataInfo()); List dataInfoList; if (opt_dataInfoList.isPresent()) @@ -167,7 +168,7 @@ public class ResultTagger implements Serializable { dataInfoList = new ArrayList<>(); c.setDataInfo(dataInfoList); } - if (subjects.contains(c)) + if (subjects.contains(cId)) dataInfoList .add( OafMapperUtils @@ -178,7 +179,7 @@ public class ResultTagger implements Serializable { CLASS_ID_SUBJECT, CLASS_NAME_BULKTAG_SUBJECT, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST)); - if (datasources.contains(c)) + if (datasources.contains(cId)) dataInfoList .add( OafMapperUtils @@ -189,7 +190,7 @@ public class ResultTagger implements Serializable { CLASS_ID_DATASOURCE, CLASS_NAME_BULKTAG_DATASOURCE, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST)); - if (czenodo.contains(c)) + if (czenodo.contains(cId)) dataInfoList .add( OafMapperUtils @@ -200,7 +201,7 @@ public class ResultTagger implements Serializable { CLASS_ID_CZENODO, CLASS_NAME_BULKTAG_ZENODO, 
DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), TAGGING_TRUST)); - if (aconstraints.contains(c)) + if (aconstraints.contains(cId)) dataInfoList .add( OafMapperUtils From b5b5a4c1928679ab308bcfe70d7969f0c1b99664 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 28 Sep 2022 12:42:51 +0200 Subject: [PATCH 02/30] [CleanCountry] fixed issue --- .../dhp/oa/graph/clean/country/CleanCountrySparkJob.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/CleanCountrySparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/CleanCountrySparkJob.java index c150c63df..cd77f342e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/CleanCountrySparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/CleanCountrySparkJob.java @@ -117,7 +117,7 @@ public class CleanCountrySparkJob implements Serializable { p -> p .getQualifier() .getClassid() - .equals(PidType.doi) && pidInParam(p.getValue(), verifyParam))) { + .equals(PidType.doi.toString()) && pidInParam(p.getValue(), verifyParam))) { r .setCountry( r From 2ebb1459a9c01eeda8d81865c75585e361cc5426 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Wed, 28 Sep 2022 14:36:57 +0300 Subject: [PATCH 03/30] Fixed type in no_downloads --- .../scripts/step16-createIndicatorsTables.sql | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index 417ed6e4e..1bda07629 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -454,16 +454,16 @@ FROM publication_datasources pd compute stats indi_pub_hybrid_oa_with_cc; create table indi_pub_downloads stored as parquet as -SELECT result_id, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats +SELECT result_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats join publication on result_id=id where downloads>0 GROUP BY result_id -order by no_dowloads desc; +order by no_downloads desc; compute stats indi_pub_downloads; create table indi_pub_downloads_datasource stored as parquet as -SELECT result_id, repository_id, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats +SELECT result_id, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats join publication on result_id=id where downloads>0 GROUP BY result_id, repository_id @@ -472,7 +472,7 @@ order by result_id; compute stats indi_pub_downloads_datasource; create table indi_pub_downloads_year stored as parquet as -SELECT result_id, substring(us.`date`, 1,4) as `year`, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats us +SELECT result_id, substring(us.`date`, 1,4) as `year`, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us join publication on result_id=id where downloads>0 GROUP BY result_id, `year` order by `year` asc; @@ -480,7 +480,7 @@ order by `year` asc; compute stats indi_pub_downloads_year; create table indi_pub_downloads_datasource_year stored as parquet as -SELECT result_id, substring(us.`date`, 1,4) as `year`, repository_id, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats us +SELECT result_id, substring(us.`date`, 1,4) as `year`, repository_id, sum(downloads) no_downloads from openaire_prod_usage_stats.usage_stats us join publication on result_id=id where downloads>0 GROUP BY result_id, 
repository_id, `year` From bdc46e3eaab57a926977d162131d78933566c08b Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Wed, 28 Sep 2022 14:59:08 +0300 Subject: [PATCH 04/30] Remove denormalization of results to fix downloads numbers in monitor --- .../oozie_app/scripts/step20-createMonitorDB.sql | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 290acbf9f..2505c3a34 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -39,7 +39,6 @@ create table TARGET.result stored as parquet as 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII) - 'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr 'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- Unviersity of Warsaw 'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly @@ -224,18 +223,3 @@ create table TARGET.indi_result_with_pid stored as parquet as select * from SOUR --create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); --compute stats TARGET.indi_software_gold_oa; ---denorm -alter table TARGET.result rename to TARGET.res_tmp; - -create table TARGET.result_denorm stored as parquet as - select distinct r.*, rp.project, p.acronym as pacronym, p.title as ptitle, 
p.funder as pfunder, p.funding_lvl0 as pfunding_lvl0, rd.datasource, d.name as dname, d.type as dtype - from TARGET.res_tmp r - left outer join TARGET.result_projects rp on rp.id=r.id - left outer join TARGET.result_datasources rd on rd.id=r.id - left outer join TARGET.project p on p.id=rp.project - left outer join TARGET.datasource d on d.id=rd.datasource; -compute stats TARGET.result_denorm; - -alter table TARGET.result_denorm rename to TARGET.result; -drop table TARGET.res_tmp; ---- done! \ No newline at end of file From 49360770d71d8930838240866a1cbc3ea06b465d Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Wed, 28 Sep 2022 14:16:39 +0200 Subject: [PATCH 05/30] map w3id as instance url --- .../dhp/oa/graph/raw/OdfToOafMapper.java | 3 ++ .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 17 +++++++---- .../eu/dnetlib/dhp/oa/graph/raw/rohub.xml | 30 ++++++++----------- 3 files changed, 27 insertions(+), 23 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index a25bcd47e..39c77bd37 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -177,6 +177,9 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='landingPage']")) { url.add(trimAndDecodeUrl(((Node) o).getText().trim())); } + for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='w3id']")) { + url.add(trimAndDecodeUrl(((Node) o).getText().trim())); + } Set validUrl = validateUrl(url); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java 
b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 7552d1789..f0eadbd0d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -936,11 +936,18 @@ class MappersTest { System.out.println("***************"); System.out.println(new ObjectMapper().writeValueAsString(list)); System.out.println("***************"); -// final OtherResearchProduct p = (OtherResearchProduct) list.get(0); -// assertValidId(p.getId()); -// assertValidId(p.getCollectedfrom().get(0).getKey()); -// System.out.println(p.getTitle().get(0).getValue()); -// assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); + assertEquals(5, list.size()); + final OtherResearchProduct p = (OtherResearchProduct) list.get(0); + assertValidId(p.getId()); + assertTrue(p.getId().startsWith("50|w3id")); + assertValidId(p.getCollectedfrom().get(0).getKey()); + assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); + assertEquals(1, p.getInstance().size()); + assertEquals("https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca", p.getPid().get(0).getValue()); + Instance inst = p.getInstance().get(0); + assertEquals("https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca", inst.getPid().get(0).getValue()); + assertEquals("https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca", inst.getUrl().get(0)); + } @Test diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/rohub.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/rohub.xml index c85b55786..e1e30c3de 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/rohub.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/rohub.xml @@ -21,15 +21,13 @@ - 
https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca - + https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca + + https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca + - - https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca/resources/24fae96f-f986-46e1-bfd0-a21ca20ff0ce - - - https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca/resources/6d3427a8-352e-49f4-9796-f618c44dc16d - + https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca/resources/24fae96f-f986-46e1-bfd0-a21ca20ff0ce + https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca/resources/6d3427a8-352e-49f4-9796-f618c44dc16d RO-crate @@ -43,21 +41,17 @@ Poznań Supercomputing and Networking Center - - - Generation Service - + + Generation Service - - CNR-ISMAR - + CNR-ISMAR - 2018-06-20T11:21:46Z + 2018-06-20T11:21:46Z The use of biological effects tools offer enormous potential to meet the challenges outlined by the European Union Marine Strategy Framework Directive (MSFD) whereby Member States are required to develop a robust set of tools for defining 11 qualitative descriptors of Good Environmental Status (GES), such as demonstrating that "Concentrations of contaminants are at levels not giving rise to pollution effects" (GES Descriptor 8). This paper discusses the combined approach of monitoring chemical contaminant levels, along side biological effect measurements relating to the effect of pollutants, for undertaking assessments of GES across European marine regions. We outline the minimum standards that biological effects tools should meet if they are to be used for defining GES in relation to Descriptor 8 and describe the current international initiatives underway to develop assessment criteria for these biological effects techniques. Crown Copyright (C) 2010 Published by Elsevier Ltd. All rights reserved. 
@@ -74,9 +68,9 @@ EOSC::RO-crate - https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca + https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca other research product - + 2018-06-20 OPEN From 2c0c3f18064da57d31f88639b41f94348a404b6e Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Wed, 28 Sep 2022 19:33:24 +0300 Subject: [PATCH 06/30] Cast amount to float for table result_apcs --- .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql index 04c7f83b9..86ead4a2c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql @@ -42,7 +42,7 @@ join ${stats_db_name}.result res on res.id=r.id; create table ${stats_db_name}.result_apc as select r.id, r.amount, r.currency from ( - select substr(r.id, 4) as id, inst.processingchargeamount.value as amount, inst.processingchargecurrency.value as currency + select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r join ${stats_db_name}.result res on res.id=r.id where r.amount is not null; From 188f25eefae15d23eb92dbd977d6e41fd78af157 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 3 Oct 2022 12:42:52 +0200 Subject: [PATCH 07/30] [BipFinder] Fixed issue for wrong escaped char in doi --- .../createunresolvedentities/PrepareBipFinder.java | 6 ++++-- .../actionmanager/createunresolvedentities/PrepareTest.java | 5 ++++- 
.../dhp/actionmanager/createunresolvedentities/bip/bip.json | 3 ++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java index 80573c71a..a4f5c22bc 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java @@ -11,6 +11,7 @@ import java.util.List; import java.util.Optional; import java.util.stream.Collectors; +import com.fasterxml.jackson.core.JsonParser; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -40,7 +41,6 @@ import eu.dnetlib.dhp.utils.DHPUtils; public class PrepareBipFinder implements Serializable { private static final Logger log = LoggerFactory.getLogger(PrepareBipFinder.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static void main(String[] args) throws Exception { @@ -82,9 +82,11 @@ public class PrepareBipFinder implements Serializable { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + ObjectMapper mapper = new ObjectMapper() + .configure(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER, true); JavaRDD bipDeserializeJavaRDD = sc .textFile(inputPath) - .map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class)); + .map(item -> mapper.readValue(item, BipDeserialize.class)); spark .createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> { diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareTest.java 
b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareTest.java index cc8108bde..d0ce69043 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareTest.java @@ -88,7 +88,7 @@ public class PrepareTest { .textFile(workingDir.toString() + "/work/bip") .map(item -> OBJECT_MAPPER.readValue(item, Result.class)); - Assertions.assertEquals(86, tmp.count()); + Assertions.assertEquals(87, tmp.count()); String doi1 = "unresolved::10.0000/096020199389707::doi"; @@ -151,6 +151,9 @@ public class PrepareTest { Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi2)).count()); Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi2)).collect().get(0).getInstance().size()); + tmp.filter(r -> r.getId().startsWith("unresolved::10.2111/1551-5028(2004)057")) + .foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r))); + } @Test diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json index 03cef4be1..3a077ab5c 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json @@ -83,4 +83,5 @@ {"10.0000/hoplos.v4i7.41295": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} {"10.0000/hoplos.v4i7.42830": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": 
"score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} {"10.0000/hoplos.v4i7.42861": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} -{"10.0000/hoplos.v4i7.43096": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} \ No newline at end of file +{"10.0000/hoplos.v4i7.43096": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} +{"10.2111/1551-5028(2004)057\[0539:sdsocg\]2.0.co;2": [{"id":"influence", "unit":[{"key":"score","value":"6.3290875E-9"},{"key":"class","value":"C"}]}, {"id":"popularity", "unit":[{"key":"score","value":"6.576763E-9"},{"key":"class","value":"C"}]}, {"id":"influence_alt", "unit":[{"key":"score","value":"11"},{"key":"class","value":"C"}]}, {"id":"popularity_alt", "unit":[{"key":"score","value":"1.0142108"},{"key":"class","value":"C"}]}, {"id":"impulse", "unit":[{"key":"score","value":"1"},{"key":"class","value":"C"}]}]} \ No newline at end of file From 28dc317350d792e75a843e8d4c9d6b2ea5735a2c Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 4 Oct 2022 09:47:27 +0200 Subject: [PATCH 08/30] [BipFinder] refactoring --- .../createunresolvedentities/PrepareBipFinder.java | 4 ++-- .../actionmanager/createunresolvedentities/PrepareTest.java | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java index a4f5c22bc..2d2633e0f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java @@ -11,7 +11,6 @@ import java.util.List; import java.util.Optional; import java.util.stream.Collectors; -import com.fasterxml.jackson.core.JsonParser; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -23,6 +22,7 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.actionmanager.bipmodel.BipDeserialize; @@ -83,7 +83,7 @@ public class PrepareBipFinder implements Serializable { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); ObjectMapper mapper = new ObjectMapper() - .configure(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER, true); + .configure(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER, true); JavaRDD bipDeserializeJavaRDD = sc .textFile(inputPath) .map(item -> mapper.readValue(item, BipDeserialize.class)); diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareTest.java index d0ce69043..6ae1f246d 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareTest.java +++ 
b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareTest.java @@ -151,8 +151,9 @@ public class PrepareTest { Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi2)).count()); Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi2)).collect().get(0).getInstance().size()); - tmp.filter(r -> r.getId().startsWith("unresolved::10.2111/1551-5028(2004)057")) - .foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r))); + tmp + .filter(r -> r.getId().startsWith("unresolved::10.2111/1551-5028(2004)057")) + .foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r))); } From 7324853a1720821fc7d2ce4f1db94cd5f9892932 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 4 Oct 2022 14:29:39 +0200 Subject: [PATCH 09/30] Revert "[BipFinder] refactoring" This reverts commit 28dc317350d792e75a843e8d4c9d6b2ea5735a2c. --- .../createunresolvedentities/PrepareBipFinder.java | 4 ++-- .../actionmanager/createunresolvedentities/PrepareTest.java | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java index 2d2633e0f..a4f5c22bc 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java @@ -11,6 +11,7 @@ import java.util.List; import java.util.Optional; import java.util.stream.Collectors; +import com.fasterxml.jackson.core.JsonParser; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -22,7 +23,6 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import 
org.slf4j.LoggerFactory; -import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.actionmanager.bipmodel.BipDeserialize; @@ -83,7 +83,7 @@ public class PrepareBipFinder implements Serializable { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); ObjectMapper mapper = new ObjectMapper() - .configure(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER, true); + .configure(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER, true); JavaRDD bipDeserializeJavaRDD = sc .textFile(inputPath) .map(item -> mapper.readValue(item, BipDeserialize.class)); diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareTest.java index 6ae1f246d..d0ce69043 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareTest.java @@ -151,9 +151,8 @@ public class PrepareTest { Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi2)).count()); Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi2)).collect().get(0).getInstance().size()); - tmp - .filter(r -> r.getId().startsWith("unresolved::10.2111/1551-5028(2004)057")) - .foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r))); + tmp.filter(r -> r.getId().startsWith("unresolved::10.2111/1551-5028(2004)057")) + .foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r))); } From 4d8339614b5e1240243e4a139d494e1988f6746e Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 4 Oct 2022 14:29:47 +0200 Subject: [PATCH 10/30] Revert "[BipFinder] Fixed issue for wrong escaped char in doi" This reverts commit 
188f25eefae15d23eb92dbd977d6e41fd78af157. --- .../createunresolvedentities/PrepareBipFinder.java | 6 ++---- .../actionmanager/createunresolvedentities/PrepareTest.java | 5 +---- .../dhp/actionmanager/createunresolvedentities/bip/bip.json | 3 +-- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java index a4f5c22bc..80573c71a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java @@ -11,7 +11,6 @@ import java.util.List; import java.util.Optional; import java.util.stream.Collectors; -import com.fasterxml.jackson.core.JsonParser; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -41,6 +40,7 @@ import eu.dnetlib.dhp.utils.DHPUtils; public class PrepareBipFinder implements Serializable { private static final Logger log = LoggerFactory.getLogger(PrepareBipFinder.class); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static void main(String[] args) throws Exception { @@ -82,11 +82,9 @@ public class PrepareBipFinder implements Serializable { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - ObjectMapper mapper = new ObjectMapper() - .configure(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER, true); JavaRDD bipDeserializeJavaRDD = sc .textFile(inputPath) - .map(item -> mapper.readValue(item, BipDeserialize.class)); + .map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class)); spark .createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> { diff 
--git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareTest.java index d0ce69043..cc8108bde 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareTest.java @@ -88,7 +88,7 @@ public class PrepareTest { .textFile(workingDir.toString() + "/work/bip") .map(item -> OBJECT_MAPPER.readValue(item, Result.class)); - Assertions.assertEquals(87, tmp.count()); + Assertions.assertEquals(86, tmp.count()); String doi1 = "unresolved::10.0000/096020199389707::doi"; @@ -151,9 +151,6 @@ public class PrepareTest { Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi2)).count()); Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi2)).collect().get(0).getInstance().size()); - tmp.filter(r -> r.getId().startsWith("unresolved::10.2111/1551-5028(2004)057")) - .foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r))); - } @Test diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json index 3a077ab5c..03cef4be1 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json @@ -83,5 +83,4 @@ {"10.0000/hoplos.v4i7.41295": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", 
"key": "score"}]}]} {"10.0000/hoplos.v4i7.42830": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} {"10.0000/hoplos.v4i7.42861": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} -{"10.0000/hoplos.v4i7.43096": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} -{"10.2111/1551-5028(2004)057\[0539:sdsocg\]2.0.co;2": [{"id":"influence", "unit":[{"key":"score","value":"6.3290875E-9"},{"key":"class","value":"C"}]}, {"id":"popularity", "unit":[{"key":"score","value":"6.576763E-9"},{"key":"class","value":"C"}]}, {"id":"influence_alt", "unit":[{"key":"score","value":"11"},{"key":"class","value":"C"}]}, {"id":"popularity_alt", "unit":[{"key":"score","value":"1.0142108"},{"key":"class","value":"C"}]}, {"id":"impulse", "unit":[{"key":"score","value":"1"},{"key":"class","value":"C"}]}]} \ No newline at end of file +{"10.0000/hoplos.v4i7.43096": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} \ No newline at end of file From 8d97949316a93884a123746cec0d60f6c34b2a2e Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 7 Oct 2022 09:52:45 +0200 Subject: [PATCH 11/30] [cleaning] fixed loop in wf nodes --- .../eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml index 2ba0a7ad7..08e74a5e5 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml @@ -492,7 +492,7 @@ --datasourcePath${workingDir}/working/hostedby --collectedfrom${collectedfrom} - + @@ -521,7 +521,7 @@ --datasourcePath${workingDir}/working/hostedby --collectedfrom${collectedfrom} - + @@ -550,7 +550,7 @@ --datasourcePath${workingDir}/working/hostedby --collectedfrom${collectedfrom} - + From ece40adc09b5124386fb4f28391f7b151ea0ceea Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 11 Oct 2022 10:10:20 +0200 Subject: [PATCH 12/30] [cleaning] fixing NPE in the country cleaning phase --- .../dhp/oa/graph/clean/country/GetDatasourceFromCountry.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/GetDatasourceFromCountry.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/GetDatasourceFromCountry.java index dd5af6998..d3741d3e8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/GetDatasourceFromCountry.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/GetDatasourceFromCountry.java @@ -65,7 +65,6 @@ public class GetDatasourceFromCountry implements Serializable { conf, isSparkSessionManaged, spark -> { - getDatasourceFromCountry(spark, country, inputPath, workingPath); }); } @@ -83,7 +82,6 @@ public class GetDatasourceFromCountry implements Serializable { (FilterFunction) o -> !o.getDataInfo().getDeletedbyinference() && o.getCountry().getClassid().length() > 0 && 
o.getCountry().getClassid().equals(country)); - ; // filtering of the relations taking the non deleted by inference and those with IsProvidedBy as relclass Dataset relation = spark @@ -97,7 +95,7 @@ public class GetDatasourceFromCountry implements Serializable { !rel.getDataInfo().getDeletedbyinference()); organization - .joinWith(relation, organization.col("id").equalTo(relation.col("target")), "left") + .joinWith(relation, organization.col("id").equalTo(relation.col("target"))) .map((MapFunction, String>) t2 -> t2._2().getSource(), Encoders.STRING()) .write() .mode(SaveMode.Overwrite) From b301e9fdfffa4da0240981f97e0bd3bb5dbbfcdc Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 11 Oct 2022 11:08:52 +0200 Subject: [PATCH 13/30] [cleaning] renamed action name/description --- .../eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml index 08e74a5e5..19e1b2a02 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml @@ -432,14 +432,14 @@ - + - + yarn cluster - Clean publications context + Select datasource ID from country eu.dnetlib.dhp.oa.graph.clean.country.GetDatasourceFromCountry dhp-graph-mapper-${projectVersion}.jar From 6163ecbf63858f5a3b34cffa19b63fa59f851d9a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 11 Oct 2022 11:20:03 +0200 Subject: [PATCH 14/30] [cleaning] renamed parameters in wf action --- .../dhp/oa/graph/clean/oozie_app/workflow.xml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git 
a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml index 19e1b2a02..6435d5131 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml @@ -471,7 +471,7 @@ yarn cluster - Clean publications counmtry + Clean publication country eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob dhp-graph-mapper-${projectVersion}.jar @@ -489,7 +489,7 @@ --workingPath${workingDir}/working/publication --country${country} --verifyParam${verifyCountryParam} - --datasourcePath${workingDir}/working/hostedby + --hostedBy${workingDir}/working/hostedby --collectedfrom${collectedfrom} @@ -500,7 +500,7 @@ yarn cluster - Clean datasets Country + Clean dataset country eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob dhp-graph-mapper-${projectVersion}.jar @@ -518,7 +518,7 @@ --workingPath${workingDir}/working/dataset --country${country} --verifyParam${verifyCountryParam} - --datasourcePath${workingDir}/working/hostedby + --hostedBy${workingDir}/working/hostedby --collectedfrom${collectedfrom} @@ -529,7 +529,7 @@ yarn cluster - Clean otherresearchproducts country + Clean otherresearchproduct country eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob dhp-graph-mapper-${projectVersion}.jar @@ -547,7 +547,7 @@ --workingPath${workingDir}/working/otherresearchproduct --country${country} --verifyParam${verifyCountryParam} - --datasourcePath${workingDir}/working/hostedby + --hostedBy${workingDir}/working/hostedby --collectedfrom${collectedfrom} @@ -558,7 +558,7 @@ yarn cluster - Clean softwares country + Clean software country eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob dhp-graph-mapper-${projectVersion}.jar @@ -576,7 +576,7 @@ 
--workingPath${workingDir}/working/software --country${country} --verifyParam${verifyCountryParam} - --datasourcePath${workingDir}/working/hostedby + --hostedBy${workingDir}/working/hostedby --collectedfrom${collectedfrom} From b47aaf4dd17b9446bd423637391fbe83aab80775 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 13 Oct 2022 11:23:43 +0200 Subject: [PATCH 15/30] [cleaning] subjects declared as belonging to specific vocabularies whose values are not found in the vocab are set to type keyword --- .../oaf/utils/GraphCleaningFunctions.java | 2 +- .../dhp/oa/graph/clean/CleaningRuleMap.java | 37 ++++++++++--------- .../clean/country/CleanCountrySparkJob.java | 2 +- .../clean/GraphCleaningFunctionsTest.java | 10 +++++ .../eu/dnetlib/dhp/oa/graph/clean/result.json | 22 +++++++++++ 5 files changed, 54 insertions(+), 19 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index 775f228eb..363f95423 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -211,7 +211,7 @@ public class GraphCleaningFunctions extends CleaningFunctions { .orElse(s.getValue()), Function.identity(), (s1, s2) -> Collections - .min(Lists.newArrayList(s1, s1), new SubjectProvenanceComparator()))) + .min(Lists.newArrayList(s1, s2), new SubjectProvenanceComparator()))) .values()); r.setSubject(subjects); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java index 147e26699..5f3b4e1ca 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java +++ 
b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.oa.graph.clean; import java.io.Serializable; import java.util.HashMap; +import java.util.Objects; import java.util.concurrent.atomic.AtomicReference; import org.apache.commons.lang3.SerializationUtils; @@ -10,6 +11,7 @@ import org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyTerm; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; @@ -31,29 +33,30 @@ public class CleaningRuleMap extends HashMap, SerializableConsumer modified = new AtomicReference<>(false); + vocabularies.find(vocabularyId).ifPresent(vocabulary -> { - if (!ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(subject.getQualifier().getClassid())) { - return; - } - Qualifier newValue = vocabulary.lookup(subject.getValue()); - if (!ModelConstants.UNKNOWN.equals(newValue.getClassid())) { - subject.setValue(newValue.getClassid()); - subject.getQualifier().setClassid(vocabularyId); - subject.getQualifier().setClassname(vocabulary.getName()); - modified.set(true); + if (ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(subject.getQualifier().getClassid())) { + Qualifier newValue = vocabulary.lookup(subject.getValue()); + if (!ModelConstants.UNKNOWN.equals(newValue.getClassid())) { + subject.setValue(newValue.getClassid()); + subject.getQualifier().setClassid(vocabularyId); + subject.getQualifier().setClassname(vocabulary.getName()); + } + } else if (vocabularyId.equals(subject.getQualifier().getClassid())) { + Qualifier syn = vocabulary.getSynonymAsQualifier(subject.getValue()); + VocabularyTerm term = vocabulary.getTerm(subject.getValue()); + if (Objects.isNull(syn) && Objects.isNull(term)) { + 
subject.getQualifier().setClassid(ModelConstants.DNET_SUBJECT_KEYWORD); + subject.getQualifier().setClassname(ModelConstants.DNET_SUBJECT_KEYWORD); + } } }); - return modified.get(); } private static void cleanRelation(VocabularyGroup vocabularies, Relation r) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/CleanCountrySparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/CleanCountrySparkJob.java index cd77f342e..45590f789 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/CleanCountrySparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/CleanCountrySparkJob.java @@ -43,7 +43,7 @@ public class CleanCountrySparkJob implements Serializable { String jsonConfiguration = IOUtils .toString( - CleanContextSparkJob.class + CleanCountrySparkJob.class .getResourceAsStream( "/eu/dnetlib/dhp/oa/graph/input_clean_country_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java index 6c43da832..4035307e5 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -278,6 +278,16 @@ public class GraphCleaningFunctionsTest { s -> "0102 computer and information sciences".equals(s.getValue()) & ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()))); + List s1 = p_cleaned + .getSubject() + .stream() + .filter(s -> s.getValue().equals("In Situ Hybridization")) + .collect(Collectors.toList()); + assertNotNull(s1); + 
assertEquals(1, s1.size()); + assertEquals(ModelConstants.DNET_SUBJECT_KEYWORD, s1.get(0).getQualifier().getClassid()); + assertEquals(ModelConstants.DNET_SUBJECT_KEYWORD, s1.get(0).getQualifier().getClassname()); + // TODO add more assertions to verity the cleaned values System.out.println(MAPPER.writeValueAsString(p_cleaned)); } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index 8e4fc4545..84ff35c08 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -706,6 +706,28 @@ "source": [ ], "subject": [ + { + "dataInfo": { + "provenanceaction": { + "classid": "sysimport:crosswalk:repository", + "classname": "sysimport:crosswalk:repository", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "deletedbyinference": false, + "inferred": false, + "inferenceprovenance": "", + "invisible": false, + "trust": "0.9" + }, + "qualifier": { + "classid": "FOS", + "classname": "Fields of Science and Technology classification", + "schemeid": "dnet:result_subject", + "schemename": "dnet:result_subject" + }, + "value": "In Situ Hybridization" + }, { "dataInfo": { "deletedbyinference": false, From a1f94530a3a170b8095de027660df0c2d4a35b09 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 13 Oct 2022 11:47:11 +0200 Subject: [PATCH 16/30] added documentation --- .../DataciteToOAFTransformation.scala | 122 +++++++++++------- 1 file changed, 74 insertions(+), 48 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala index c29614d33..29f5cb99c 100644 --- 
a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala @@ -24,11 +24,11 @@ import scala.io.Source object DataciteToOAFTransformation { case class HostedByMapType( - openaire_id: String, - datacite_name: String, - official_name: String, - similarity: Option[Float] - ) {} + openaire_id: String, + datacite_name: String, + official_name: String, + similarity: Option[Float] + ) {} val mapper = new ObjectMapper() @@ -47,12 +47,12 @@ object DataciteToOAFTransformation { } /** This method should skip record if json contains invalid text - * defined in file datacite_filter - * - * @param record : unparsed datacite record - * @param json : parsed record - * @return True if the record should be skipped - */ + * defined in file datacite_filter + * + * @param record : not parsed Datacite record + * @param json : parsed record + * @return True if the record should be skipped + */ def skip_record(record: String, json: org.json4s.JValue): Boolean = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats datacite_filter.exists(f => record.contains(f)) || (json \\ "publisher") @@ -98,6 +98,11 @@ object DataciteToOAFTransformation { } + /** + * This utility method indicates whether the embargo date has been reached + * @param embargo_end_date + * @return True if the embargo date has been reached, false otherwise + */ def embargo_end(embargo_end_date: String): Boolean = { val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]")) val td = LocalDate.now() @@ -142,12 +147,27 @@ object DataciteToOAFTransformation { } } + /*** + * Use the vocabulary dnet:publication_resource to find a synonym to one of these terms and get the instance.type. 
+ * Using the dnet:result_typologies vocabulary, we look up the instance.type synonym + * to generate one of the following main entities: + * - publication + * - dataset + * - software + * - otherresearchproduct + + * @param resourceType + * @param resourceTypeGeneral + * @param schemaOrg + * @param vocabularies + * @return + */ def getTypeQualifier( - resourceType: String, - resourceTypeGeneral: String, - schemaOrg: String, - vocabularies: VocabularyGroup - ): (Qualifier, Qualifier) = { + resourceType: String, + resourceTypeGeneral: String, + schemaOrg: String, + vocabularies: VocabularyGroup + ): (Qualifier, Qualifier) = { if (resourceType != null && resourceType.nonEmpty) { val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType) @@ -192,11 +212,11 @@ object DataciteToOAFTransformation { } def getResult( - resourceType: String, - resourceTypeGeneral: String, - schemaOrg: String, - vocabularies: VocabularyGroup - ): Result = { + resourceType: String, + resourceTypeGeneral: String, + schemaOrg: String, + vocabularies: VocabularyGroup + ): Result = { val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies) if (typeQualifiers == null) @@ -238,11 +258,11 @@ object DataciteToOAFTransformation { } /** As describe in ticket #6377 - * when the result come from figshare we need to remove subject - * and set Access rights OPEN. - * - * @param r - */ + * when the result come from figshare we need to remove subject + * and set Access rights OPEN. 
+ * + * @param r + */ def fix_figshare(r: Result): Unit = { if (r.getInstance() != null) { @@ -269,12 +289,12 @@ object DataciteToOAFTransformation { } def generateRelation( - sourceId: String, - targetId: String, - relClass: String, - cf: KeyValue, - di: DataInfo - ): Relation = { + sourceId: String, + targetId: String, + relClass: String, + cf: KeyValue, + di: DataInfo + ): Relation = { val r = new Relation r.setSource(sourceId) @@ -303,12 +323,12 @@ object DataciteToOAFTransformation { } def generateOAF( - input: String, - ts: Long, - dateOfCollection: Long, - vocabularies: VocabularyGroup, - exportLinks: Boolean - ): List[Oaf] = { + input: String, + ts: Long, + dateOfCollection: Long, + vocabularies: VocabularyGroup, + exportLinks: Boolean + ): List[Oaf] = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json = parse(input) @@ -330,6 +350,7 @@ object DataciteToOAFTransformation { if (result == null) return List() + // DOI is mapped on a PID inside a Instance object val doi_q = OafMapperUtils.qualifier( "doi", "doi", @@ -338,6 +359,8 @@ object DataciteToOAFTransformation { ) val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo) result.setPid(List(pid).asJava) + + // This identifiere will be replaced in a second moment using the PID logic generation result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true)) result.setOriginalId(List(doi).asJava) @@ -386,6 +409,11 @@ object DataciteToOAFTransformation { a } + if (authors == null || authors.isEmpty || !authors.exists(a => a != null)) + return List() + result.setAuthor(authors.asJava) + + val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List()) result.setTitle( @@ -409,9 +437,7 @@ object DataciteToOAFTransformation { .asJava ) - if (authors == null || authors.isEmpty || !authors.exists(a => a != null)) - return List() - result.setAuthor(authors.asJava) + val dates = (json \\ "dates").extract[List[DateType]] val 
publication_year = (json \\ "publicationYear").extractOrElse[String](null) @@ -619,16 +645,16 @@ object DataciteToOAFTransformation { } private def generateRelations( - rels: List[RelatedIdentifierType], - id: String, - date: String - ): List[Relation] = { + rels: List[RelatedIdentifierType], + id: String, + date: String + ): List[Relation] = { rels .filter(r => subRelTypeMapping .contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") || - r.relatedIdentifierType.equalsIgnoreCase("pmid") || - r.relatedIdentifierType.equalsIgnoreCase("arxiv")) + r.relatedIdentifierType.equalsIgnoreCase("pmid") || + r.relatedIdentifierType.equalsIgnoreCase("arxiv")) ) .map(r => { val rel = new Relation @@ -660,4 +686,4 @@ object DataciteToOAFTransformation { s"10|$b::${DHPUtils.md5(a)}" } -} +} \ No newline at end of file From ae7cd0735a8528cd419ce2863077d436bd34cd0d Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 14 Oct 2022 15:47:58 +0200 Subject: [PATCH 17/30] [graph2hive] more partitions --- .../eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml index ba5f4f375..4468382be 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml @@ -126,6 +126,7 @@ --hiveDbName${hiveDbName} --classNameeu.dnetlib.dhp.schema.oaf.Publication --hiveMetastoreUris${hiveMetastoreUris} + --numPartitions8000 @@ -152,6 +153,7 @@ --hiveDbName${hiveDbName} --classNameeu.dnetlib.dhp.schema.oaf.Dataset --hiveMetastoreUris${hiveMetastoreUris} + --numPartitions4000 @@ -178,6 +180,7 @@ --hiveDbName${hiveDbName} 
--classNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct --hiveMetastoreUris${hiveMetastoreUris} + --numPartitions3000 @@ -204,6 +207,7 @@ --hiveDbName${hiveDbName} --classNameeu.dnetlib.dhp.schema.oaf.Software --hiveMetastoreUris${hiveMetastoreUris} + --numPartitions300 @@ -230,6 +234,7 @@ --hiveDbName${hiveDbName} --classNameeu.dnetlib.dhp.schema.oaf.Datasource --hiveMetastoreUris${hiveMetastoreUris} + --numPartitions100 @@ -256,6 +261,7 @@ --hiveDbName${hiveDbName} --classNameeu.dnetlib.dhp.schema.oaf.Organization --hiveMetastoreUris${hiveMetastoreUris} + --numPartitions400 @@ -309,6 +315,7 @@ --hiveDbName${hiveDbName} --classNameeu.dnetlib.dhp.schema.oaf.Relation --hiveMetastoreUris${hiveMetastoreUris} + --numPartitions10000 From 72f0d88d6ce9536a2577b24b8c5d7242b5dc6e39 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 19 Oct 2022 14:18:42 +0200 Subject: [PATCH 18/30] formatted code --- .../DataciteToOAFTransformation.scala | 132 +++++++++--------- 1 file changed, 64 insertions(+), 68 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala index 29f5cb99c..a7ad9e2d6 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala @@ -24,11 +24,11 @@ import scala.io.Source object DataciteToOAFTransformation { case class HostedByMapType( - openaire_id: String, - datacite_name: String, - official_name: String, - similarity: Option[Float] - ) {} + openaire_id: String, + datacite_name: String, + official_name: String, + similarity: Option[Float] + ) {} val mapper = new ObjectMapper() @@ -47,12 +47,12 @@ object DataciteToOAFTransformation { } /** This method should skip record if json contains invalid text - * defined in file 
datacite_filter - * - * @param record : not parsed Datacite record - * @param json : parsed record - * @return True if the record should be skipped - */ + * defined in file datacite_filter + * + * @param record : not parsed Datacite record + * @param json : parsed record + * @return True if the record should be skipped + */ def skip_record(record: String, json: org.json4s.JValue): Boolean = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats datacite_filter.exists(f => record.contains(f)) || (json \\ "publisher") @@ -98,11 +98,10 @@ object DataciteToOAFTransformation { } - /** - * This utility method indicates whether the embargo date has been reached - * @param embargo_end_date - * @return True if the embargo date has been reached, false otherwise - */ + /** This utility method indicates whether the embargo date has been reached + * @param embargo_end_date + * @return True if the embargo date has been reached, false otherwise + */ def embargo_end(embargo_end_date: String): Boolean = { val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]")) val td = LocalDate.now() @@ -147,27 +146,27 @@ object DataciteToOAFTransformation { } } - /*** - * Use the vocabulary dnet:publication_resource to find a synonym to one of these terms and get the instance.type. - * Using the dnet:result_typologies vocabulary, we look up the instance.type synonym - * to generate one of the following main entities: - * - publication - * - dataset - * - software - * - otherresearchproduct - - * @param resourceType - * @param resourceTypeGeneral - * @param schemaOrg - * @param vocabularies - * @return - */ + /** * + * Use the vocabulary dnet:publication_resource to find a synonym to one of these terms and get the instance.type. 
+ * Using the dnet:result_typologies vocabulary, we look up the instance.type synonym + * to generate one of the following main entities: + * - publication + * - dataset + * - software + * - otherresearchproduct + * + * @param resourceType + * @param resourceTypeGeneral + * @param schemaOrg + * @param vocabularies + * @return + */ def getTypeQualifier( - resourceType: String, - resourceTypeGeneral: String, - schemaOrg: String, - vocabularies: VocabularyGroup - ): (Qualifier, Qualifier) = { + resourceType: String, + resourceTypeGeneral: String, + schemaOrg: String, + vocabularies: VocabularyGroup + ): (Qualifier, Qualifier) = { if (resourceType != null && resourceType.nonEmpty) { val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType) @@ -212,11 +211,11 @@ object DataciteToOAFTransformation { } def getResult( - resourceType: String, - resourceTypeGeneral: String, - schemaOrg: String, - vocabularies: VocabularyGroup - ): Result = { + resourceType: String, + resourceTypeGeneral: String, + schemaOrg: String, + vocabularies: VocabularyGroup + ): Result = { val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies) if (typeQualifiers == null) @@ -258,11 +257,11 @@ object DataciteToOAFTransformation { } /** As describe in ticket #6377 - * when the result come from figshare we need to remove subject - * and set Access rights OPEN. - * - * @param r - */ + * when the result come from figshare we need to remove subject + * and set Access rights OPEN. 
+ * + * @param r + */ def fix_figshare(r: Result): Unit = { if (r.getInstance() != null) { @@ -289,12 +288,12 @@ object DataciteToOAFTransformation { } def generateRelation( - sourceId: String, - targetId: String, - relClass: String, - cf: KeyValue, - di: DataInfo - ): Relation = { + sourceId: String, + targetId: String, + relClass: String, + cf: KeyValue, + di: DataInfo + ): Relation = { val r = new Relation r.setSource(sourceId) @@ -323,12 +322,12 @@ object DataciteToOAFTransformation { } def generateOAF( - input: String, - ts: Long, - dateOfCollection: Long, - vocabularies: VocabularyGroup, - exportLinks: Boolean - ): List[Oaf] = { + input: String, + ts: Long, + dateOfCollection: Long, + vocabularies: VocabularyGroup, + exportLinks: Boolean + ): List[Oaf] = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json = parse(input) @@ -413,7 +412,6 @@ object DataciteToOAFTransformation { return List() result.setAuthor(authors.asJava) - val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List()) result.setTitle( @@ -437,8 +435,6 @@ object DataciteToOAFTransformation { .asJava ) - - val dates = (json \\ "dates").extract[List[DateType]] val publication_year = (json \\ "publicationYear").extractOrElse[String](null) @@ -645,16 +641,16 @@ object DataciteToOAFTransformation { } private def generateRelations( - rels: List[RelatedIdentifierType], - id: String, - date: String - ): List[Relation] = { + rels: List[RelatedIdentifierType], + id: String, + date: String + ): List[Relation] = { rels .filter(r => subRelTypeMapping .contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") || - r.relatedIdentifierType.equalsIgnoreCase("pmid") || - r.relatedIdentifierType.equalsIgnoreCase("arxiv")) + r.relatedIdentifierType.equalsIgnoreCase("pmid") || + r.relatedIdentifierType.equalsIgnoreCase("arxiv")) ) .map(r => { val rel = new Relation @@ -686,4 +682,4 @@ object DataciteToOAFTransformation { 
s"10|$b::${DHPUtils.md5(a)}" } -} \ No newline at end of file +} From bca4a61710ed7d394937a28973f44018d99aa2be Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 19 Oct 2022 15:20:58 +0200 Subject: [PATCH 19/30] suppressing hyper verbose spark logs during unit test execution --- .../src/test/resources/log4j.properties | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 dhp-workflows/dhp-enrichment/src/test/resources/log4j.properties diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/log4j.properties b/dhp-workflows/dhp-enrichment/src/test/resources/log4j.properties new file mode 100644 index 000000000..ce37270c6 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/resources/log4j.properties @@ -0,0 +1,25 @@ +# Root logger option +log4j.rootLogger=DEBUG, stdout + +# Direct log messages to stdout +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.Target=System.out +log4j.appender.stdout.layout=org.apache.log4j.PatternLayout +log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n + +# Change this to set Spark log level +log4j.logger.org.apache.spark=ERROR +log4j.rootCategory=WARN + +# Silence akka remoting +log4j.logger.Remoting=WARN + +# Ignore messages below warning level from Jetty, because it's a bit verbose +log4j.logger.org.eclipse.jetty=WARN + +log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitterFactory=WARN +log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter=WARN +log4j.logger.org.apache.parquet.hadoop.ParquetOutputFormat=WARN +log4j.logger.org.apache.parquet.hadoop.InternalParquetRecordWriter=WARN +log4j.logger.org.apache.hadoop.io.compress.CodecPool=WARN +log4j.logger.org.apache.parquet.hadoop.codec.CodecConfig=WARN \ No newline at end of file From 31a10f000b200d047ad50be1c0a376a7316d8be9 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Sun, 23 Oct 2022 18:05:37 +0200 Subject: [PATCH 20/30] Map the field 
oaf:eoscifguidelines from mdstores. Currently we can find it in ROHub metadata --- .../raw/AbstractMdRecordToOafMapper.java | 20 +++++++++++++++++++ .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 1 + .../eu/dnetlib/dhp/oa/graph/raw/rohub.xml | 5 ++++- 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index c157be51a..a33a04be3 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -366,6 +366,7 @@ public abstract class AbstractMdRecordToOafMapper { r.setInstance(instances); r.setBestaccessright(OafMapperUtils.createBestAccessRights(instances)); + r.setEoscifguidelines(prepareEOSCIfGuidelines(doc, info)); } protected abstract List prepareResultPids(Document doc, DataInfo info); @@ -384,6 +385,25 @@ public abstract class AbstractMdRecordToOafMapper { return list; } + private List prepareEOSCIfGuidelines(Document doc, DataInfo info){ + final Set set = Sets.newHashSet(); + for (final Object o : doc.selectNodes("//oaf:eoscifguidelines")) { + final String code = ((Node) o).valueOf("@code"); + final String label = ((Node) o).valueOf("@label"); + final String url = ((Node) o).valueOf("@url"); + final String semrel = ((Node) o).valueOf("@semrel"); + if (StringUtils.isNotBlank(code)) { + final EoscIfGuidelines eig = new EoscIfGuidelines(); + eig.setCode(code); + eig.setLabel(label); + eig.setUrl(url); + eig.setSemanticRelation(semrel); + set.add(eig); + } + } + return Lists.newArrayList(set); + } + protected abstract Qualifier prepareResourceType(Document doc, DataInfo info); protected abstract List prepareInstances( diff --git 
a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index f0eadbd0d..8c9b3caba 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -947,6 +947,7 @@ class MappersTest { Instance inst = p.getInstance().get(0); assertEquals("https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca", inst.getPid().get(0).getValue()); assertEquals("https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca", inst.getUrl().get(0)); + assertEquals(1, p.getEoscifguidelines().size()); } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/rohub.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/rohub.xml index e1e30c3de..18f637ecc 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/rohub.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/rohub.xml @@ -65,7 +65,6 @@ Ecology - EOSC::RO-crate https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca @@ -75,5 +74,9 @@ + \ No newline at end of file From ee759ac92da0116f2f6c0c8b11aacce98e5a55a3 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Sun, 23 Oct 2022 18:09:47 +0200 Subject: [PATCH 21/30] file format after mvn compile --- .../dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index a33a04be3..02b1e7e7c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ 
b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -385,7 +385,7 @@ public abstract class AbstractMdRecordToOafMapper { return list; } - private List prepareEOSCIfGuidelines(Document doc, DataInfo info){ + private List prepareEOSCIfGuidelines(Document doc, DataInfo info) { final Set set = Sets.newHashSet(); for (final Object o : doc.selectNodes("//oaf:eoscifguidelines")) { final String code = ((Node) o).valueOf("@code"); From 208ed323153a0b19189f6ddf65776f67a6df2e67 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Sun, 23 Oct 2022 18:18:13 +0200 Subject: [PATCH 22/30] fixed xpath for semantic relation --- .../dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java | 2 +- .../test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 02b1e7e7c..7aa40cb8a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -391,7 +391,7 @@ public abstract class AbstractMdRecordToOafMapper { final String code = ((Node) o).valueOf("@code"); final String label = ((Node) o).valueOf("@label"); final String url = ((Node) o).valueOf("@url"); - final String semrel = ((Node) o).valueOf("@semrel"); + final String semrel = ((Node) o).valueOf("@semanticrelation"); if (StringUtils.isNotBlank(code)) { final EoscIfGuidelines eig = new EoscIfGuidelines(); eig.setCode(code); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 
8c9b3caba..ad733bec0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -948,6 +948,10 @@ class MappersTest { assertEquals("https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca", inst.getPid().get(0).getValue()); assertEquals("https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca", inst.getUrl().get(0)); assertEquals(1, p.getEoscifguidelines().size()); + assertEquals("EOSC::RO-crate", p.getEoscifguidelines().get(0).getCode()); + assertEquals("EOSC::RO-crate", p.getEoscifguidelines().get(0).getLabel()); + assertEquals("", p.getEoscifguidelines().get(0).getUrl()); + assertEquals("compliesWith", p.getEoscifguidelines().get(0).getSemanticRelation()); } From 2b9a20a4a378d887d801df3eb6958bd20b079276 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 24 Oct 2022 12:53:47 +0200 Subject: [PATCH 23/30] Changed the way Scholexplorer filter the relationships, I found that filter all relation coming from openCitation is wrong, because we loose a lot of relation than intersect OpenCitation, but they don't come only from there --- .../sx/graph/SparkConvertRDDtoDataset.scala | 77 ++++++++----------- 1 file changed, 34 insertions(+), 43 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala index 556106180..362cb2028 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala @@ -116,54 +116,45 @@ object SparkConvertRDDtoDataset { .map(s => mapper.readValue(s, classOf[Relation])) .filter(r => r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference) .filter(r => 
r.getSource.startsWith("50") && r.getTarget.startsWith("50")) - .filter(r => filterRelations(subRelTypeFilter, relClassFilter, r)) - //filter OpenCitations relations - .filter(r => - r.getDataInfo.getProvenanceaction != null && - !"sysimport:crosswalk:opencitations".equals(r.getDataInfo.getProvenanceaction.getClassid) - ) + .filter(r => filterRelations(r)) + //filter OpenCitations relations +// .filter(r => +// r.getDataInfo.getProvenanceaction != null && +// !"sysimport:crosswalk:opencitations".equals(r.getDataInfo.getProvenanceaction.getClassid) +// ) spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath") } - private def filterRelations(subRelTypeFilter: String, relClassFilter: List[String], r: Relation): Boolean = { - if (StringUtils.isNotBlank(subRelTypeFilter)) { - subRelTypeFilter.equalsIgnoreCase(r.getSubRelType) - } else { - !relClassFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)) + private def filterRelations(r: Relation): Boolean = { + + /** * + * We filter relation generated by dedups + * and all the relation that have one single collectedFrom OpenCitation + */ + + val relClassFilter = List( + ModelConstants.MERGES, + ModelConstants.IS_MERGED_IN, + ModelConstants.HAS_AMONG_TOP_N_SIMILAR_DOCS, + ModelConstants.IS_AMONG_TOP_N_SIMILAR_DOCS + ) + if (relClassFilter.exists(k => k.equalsIgnoreCase(r.getRelClass))) + false + else { + if (r.getCollectedfrom == null || r.getCollectedfrom.size() == 0) + false + else if (r.getCollectedfrom.size() > 1) + true + else if ( + r.getCollectedfrom.size() == 1 && r.getCollectedfrom.get(0) != null && "OpenCitations".equalsIgnoreCase( + r.getCollectedfrom.get(0).getValue + ) + ) + false + else + true } } - /* - //TODO: finalise implementation - private def processResult[T<: Result]( - implicit ct: ClassTag[T], - log: Logger, - spark: SparkSession, - sourcePath: String, - entityPath: String, - clazz: Class[T] - ): Unit = { - val entityType = clazz.getSimpleName.toLowerCase - - 
log.info(s"Converting $entityType") - - val mapper = new ObjectMapper() with ScalaObjectMapper - mapper.registerModule(DefaultScalaModule) - - val rdd = spark.sparkContext - .textFile(s"$sourcePath/$entityType") - .map(s => mapper.readValue(s, clazz)) - .filter(r => r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference); - - implicit val encoder: Encoder[T] = Encoders.kryo(clazz) - spark - .createDataset(rdd) - .as[T] - .write - .mode(SaveMode.Overwrite) - .save(s"$entityPath/$entityType") - } - */ - } From 5df9c6396336e895708d00642ebb229825dc7665 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Thu, 27 Oct 2022 16:44:26 +0300 Subject: [PATCH 24/30] Added fields: totalcost, fundedamount, currency, in project table --- .../dhp/oa/graph/stats/oozie_app/scripts/step11.sql | 4 +++- .../dhp/oa/graph/stats/oozie_app/scripts/step13.sql | 8 ++++---- .../dhp/oa/graph/stats/oozie_app/scripts/step6.sql | 8 ++++++-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql index d699b68c3..41c3ed751 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql @@ -42,7 +42,9 @@ SELECT p.id, CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END AS delayedpubs, p.callidentifier, p.code, - p.totalcost + p.totalcost, + p.fundedamount, + p.currency FROM ${stats_db_name}.project_tmp p LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np FROM ${stats_db_name}.project_results pr diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql index aee66fd5e..24e1a1355 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql @@ -59,7 +59,7 @@ UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; -create table ${stats_db_name}.result_orcid STORED AS PARQUET as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as select distinct res.id, regexp_replace(res.orcid, 'http://orcid.org/' ,'') as orcid from ( SELECT substr(res.id, 4) as id, auth_pid.value as orcid @@ -69,7 +69,7 @@ from ( LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res; -create table ${stats_db_name}.result_result stored as parquet as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_result stored as parquet as select substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype from ${openaire_db_name}.relation rel join ${openaire_db_name}.result r1 on rel.source=r1.id @@ -82,7 +82,7 @@ where reltype='resultResult' and r2.resulttype.classname != 'other' and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; -create table ${stats_db_name}.result_citations_oc stored as parquet as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_citations_oc stored as parquet as select substr(target, 4) as id, count(distinct substr(source, 4)) as citations from ${openaire_db_name}.relation rel join ${openaire_db_name}.result r1 on rel.source=r1.id @@ -97,7 +97,7 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE group by substr(target, 
4); -create table ${stats_db_name}.result_references_oc stored as parquet as +CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_references_oc stored as parquet as select substr(source, 4) as id, count(distinct substr(target, 4)) as references from ${openaire_db_name}.relation rel join ${openaire_db_name}.result r1 on rel.source=r1.id diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index 5461afde6..c31180c14 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -48,7 +48,9 @@ CREATE TABLE ${stats_db_name}.project_tmp delayedpubs INT, callidentifier STRING, code STRING, - totalcost FLOAT + totalcost FLOAT, + fundedamount FLOAT, + currency STRING ) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); INSERT INTO ${stats_db_name}.project_tmp @@ -72,7 +74,9 @@ SELECT substr(p.id, 4) AS id, 0 AS delayedpubs, p.callidentifier.value AS callidentifier, p.code.value AS code, - p.totalcost AS totalcost + p.totalcost AS totalcost, + p.fundedamount AS fundedamount, + p.currency.value AS currency FROM ${openaire_db_name}.project p WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; From 7861c472e0c8e4c3084a04721dafc736111c4963 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Fri, 28 Oct 2022 19:00:32 +0300 Subject: [PATCH 25/30] Hive memory parameters --- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml index 9331d4ac5..63fc84d75 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml @@ -21,7 +21,7 @@ hive_jdbc_url - jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000 + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=19166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=11596411699;spark.yarn.driver.memoryOverhead=1228 oozie.wf.workflow.notification.url From 2687fc9f73733d2b8486f44bbdba6b8130359551 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Tue, 22 Nov 2022 17:30:56 +0100 Subject: [PATCH 26/30] tests for EOSC Future review - ROhub --- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 11 ++ .../oa/graph/raw/photic-zone-transformed.xml | 108 +++++++++++++++++ .../dhp/oa/provision/EOSCFuture_Test.java | 88 ++++++++++++++ .../eosc-future/photic-zone-transformed.xml | 98 +++++++++++++++ .../oa/provision/eosc-future/photic-zone.json | 1 + .../eu/dnetlib/dhp/oa/provision/fields.xml | 112 ++++++++---------- 6 files changed, 357 insertions(+), 61 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/photic-zone-transformed.xml create mode 100644 dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/EOSCFuture_Test.java create mode 100644 dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/photic-zone-transformed.xml create mode 100644 dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/photic-zone.json diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java 
b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index f0eadbd0d..184383f92 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -995,6 +995,17 @@ class MappersTest { } + @Test + void testEOSCFuture_ROHub() throws IOException { + final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("photic-zone-transformed.xml"))); + final List list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); + final OtherResearchProduct rocrate = (OtherResearchProduct) list.get(0); + assertNotNull(rocrate.getEoscifguidelines()); + System.out.println("***************"); + System.out.println(new ObjectMapper().writeValueAsString(rocrate)); + System.out.println("***************"); + } + @Test void testNotWellFormed() throws IOException { final String xml = IOUtils diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/photic-zone-transformed.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/photic-zone-transformed.xml new file mode 100644 index 000000000..22bf0577e --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/photic-zone-transformed.xml @@ -0,0 +1,108 @@ + + +
+ fsh_____4119::68126da991bd76d8be494bddfbf7a1bb + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be + + + + + + 2022-11-15T12:29:19Z + 2022-11-15T12:29:19Z + fsh_____4119 + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be + 2022-11-15T12:29:19Z + rohub_data + ro-crate_data +
+ + + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be + + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be + + + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/b1b617b2-6b79-4bae-9fa6-b76945645626 + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/78103994-30be-4875-bf89-5acd752b5c3d + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/18fd1c70-249b-4c67-80ee-539f801a0da7 + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/32faa2eb-4cc8-401f-ac5c-bec2849b70e1 + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/4c253f5a-d427-40c2-9e9f-6063ae087239 + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/371b1957-078c-472b-a195-af7bce152c10 + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/82f9e4b8-01b4-4e50-9e27-ec9d337c8d74 + + RO-crate + + Creative Commons Attribution 4.0 International + open access + + + Mapping the photic zone of the Mediterranean Sea + + + Estimating the penetration of light along the water column from satellite data to map the photic zone in the Mediterranean Sea + + CNR-ISMAR + + + Giorgio Castellan + + + Lorenzo Angeletti + + + Paolo Montagna + + + Marco Taviani + + + + 2022-11-14T16:32:45Z + + + Estimating the penetration of light along the water column from satellite data to map the photic zone in the Mediterranean Sea + + 2022 + + open access + + + 813.478 KB + + + Earth sciences + Ecology + Optics + + + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be + 0048 + 2022-11-14 + OPEN + https://creativecommons.org/licenses/by/4.0/legalcode + + + + + + + +
\ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/EOSCFuture_Test.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/EOSCFuture_Test.java new file mode 100644 index 000000000..08bf19fe4 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/EOSCFuture_Test.java @@ -0,0 +1,88 @@ +package eu.dnetlib.dhp.oa.provision; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.oa.provision.model.JoinedEntity; +import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; +import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory; +import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory; +import org.apache.commons.io.IOUtils; +import org.apache.solr.client.solrj.util.ClientUtils; +import org.apache.solr.common.SolrInputDocument; +import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.io.SAXReader; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerException; +import java.io.IOException; +import java.io.StringReader; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class EOSCFuture_Test { + + public static ObjectMapper OBJECT_MAPPER = new ObjectMapper() + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + + public static final String VERSION = "2021-04-15T10:05:53Z"; + public static final String DSID = "b9ee796a-c49f-4473-a708-e7d67b84c16d_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl"; + + private ContextMapper contextMapper; + + @BeforeEach + public void setUp() { + contextMapper = new ContextMapper(); + 
} + + + @Test + public void testEOSC_ROHub() throws IOException, DocumentException, TransformerException { + + final ContextMapper contextMapper = new ContextMapper(); + + final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, + XmlConverterJob.schemaLocation); + + final OtherResearchProduct p = OBJECT_MAPPER + .readValue(IOUtils.toString(getClass().getResourceAsStream("eosc-future/photic-zone.json")), OtherResearchProduct.class); + + final String xml = xmlRecordFactory.build(new JoinedEntity<>(p)); + + assertNotNull(xml); + + final Document doc = new SAXReader().read(new StringReader(xml)); + + assertNotNull(doc); + System.out.println(doc.asXML()); + + + testRecordTransformation(xml); + } + + + private void testRecordTransformation(final String record) throws IOException, TransformerException { + final String fields = IOUtils.toString(getClass().getResourceAsStream("fields.xml")); + final String xslt = IOUtils.toString(getClass().getResourceAsStream("layoutToRecordTransformer.xsl")); + + final String transformer = XmlIndexingJob.getLayoutTransformer("DMF", fields, xslt); + + final Transformer tr = SaxonTransformerFactory.newInstance(transformer); + + final String indexRecordXML = XmlIndexingJob.toIndexRecord(tr, record); + + final SolrInputDocument solrDoc = new StreamingInputDocumentFactory(VERSION, DSID) + .parseDocument(indexRecordXML); + + final String xmlDoc = ClientUtils.toXML(solrDoc); + + Assertions.assertNotNull(xmlDoc); + System.out.println(xmlDoc); + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/photic-zone-transformed.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/photic-zone-transformed.xml new file mode 100644 index 000000000..79830b0f7 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/photic-zone-transformed.xml @@ -0,0 +1,98 @@ + + +
+ https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be + 2022-11-15T12:29:19Z + rohub_data + ro-crate_data +
+ + + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be + + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be + + + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/b1b617b2-6b79-4bae-9fa6-b76945645626 + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/78103994-30be-4875-bf89-5acd752b5c3d + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/18fd1c70-249b-4c67-80ee-539f801a0da7 + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/32faa2eb-4cc8-401f-ac5c-bec2849b70e1 + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/4c253f5a-d427-40c2-9e9f-6063ae087239 + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/371b1957-078c-472b-a195-af7bce152c10 + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/82f9e4b8-01b4-4e50-9e27-ec9d337c8d74 + + RO-crate + + Creative Commons Attribution 4.0 International + open access + + + Mapping the photic zone of the Mediterranean Sea + + + Estimating the penetration of light along the water column from satellite data to map the photic zone in the Mediterranean Sea + + CNR-ISMAR + + + Giorgio Castellan + + + Lorenzo Angeletti + + + Paolo Montagna + + + Marco Taviani + + + + 2022-11-14T16:32:45Z + + + Estimating the penetration of light along the water column from satellite data to map the photic zone in the Mediterranean Sea + + 2022 + + open access + + + 813.478 KB + + + Earth sciences + Ecology + Optics + + + https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be + 0048 + 2022-11-14 + OPEN + https://creativecommons.org/licenses/by/4.0/legalcode + + + + + + + +
\ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/photic-zone.json b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/photic-zone.json new file mode 100644 index 000000000..ffef2740a --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/photic-zone.json @@ -0,0 +1 @@ +{"collectedfrom":[{"key":"10|fairsharing_::1b69ebedb522700034547abc5652ffac","value":"ROHub","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1669132109711,"id":"50|w3id________::68126da991bd76d8be494bddfbf7a1bb","originalId":["50|fsh_____4119::68126da991bd76d8be494bddfbf7a1bb","https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be"],"pid":[{"value":"https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be","qualifier":{"classid":"w3id","classname":"w3id.org","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"dateofcollection":"2022-11-15T12:29:19Z","dateoftransformation":"2022-11-15T12:29:19Z","extraInfo":[],"oaiprovenance":null,"processingchargeamount":null,"processingchargecurrency":null,"measures":null,"author":[{"fullname":"Giorgio Castellan","name":"","surname":"","rank":1,"pid":[],"affiliation":[]},{"fullname":"Lorenzo Angeletti","name":"","surname":"","rank":2,"pid":[],"affiliation":[]},{"fullname":"Paolo 
Montagna","name":"","surname":"","rank":3,"pid":[],"affiliation":[]},{"fullname":"Marco Taviani","name":"","surname":"","rank":4,"pid":[],"affiliation":[]}],"resulttype":{"classid":"other","classname":"other","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:languages","schemename":"dnet:languages"},"country":[],"subject":[{"value":"Earth sciences","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"Ecology","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"Optics","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[{"value":"Mapping the photic zone of the Mediterranean Sea","qualifier":{"classid":"main title","classname":"main 
title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"relevantdate":[{"value":"2022-11-14T16:32:45Z","qualifier":{"classid":"Issued","classname":"Issued","schemeid":"dnet:dataCite_date","schemename":"dnet:dataCite_date"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"description":[{"value":"Estimating the penetration of light along the water column from satellite data to map the photic zone in the Mediterranean Sea","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"dateofacceptance":{"value":"2022-11-14","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"publisher":{"value":"CNR-ISMAR","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"embargoenddate":null,"sou
rce":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"RO-crate","classname":"RO-crate","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"coverage":[],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[],"externalReference":[],"instance":[{"license":{"value":"https://creativecommons.org/licenses/by/4.0/legalcode","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes","openAccessRoute":null},"instancetype":{"classid":"0048","classname":"Research Object","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"hostedby":{"key":"10|fairsharing_::1b69ebedb522700034547abc5652ffac","value":"ROHub","dataInfo":null},"url":["https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be"],"distributionlocation":null,"collectedfrom":{"key":"10|fairsharing_::1b69ebedb522700034547abc5652ffac","value":"ROHub","dataInfo":null},"pid":[{"value":"https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be","qualifier":{"classid":"w3id","classname":"w3id.org","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"alternateIdentifier":[],"dateofacceptance":{"value":"2022-11-14","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","infe
renceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"processingchargeamount":null,"processingchargecurrency":null,"refereed":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"measures":null}],"eoscifguidelines":null,"contactperson":[],"contactgroup":[],"tool":[]} diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml index 910a366f6..be2ee7b98 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml @@ -2,11 +2,11 @@ - - - - - + + + + + @@ -14,17 +14,16 @@ - - - - + + + - - - - - - + + + + + + @@ -34,18 +33,17 @@ - - - + + - - - + + + - + - + @@ -54,35 +52,36 @@ - + - + - + - - + + - + - - + + + - - - + + + @@ -94,26 +93,29 @@ - + - - + + + + + - - + + - + - + - - + + - + @@ -132,13 +134,15 @@ + - + + - + @@ -156,20 +160,6 @@ - - - - - - - - - - - - - - - + \ No newline at end of file From 2832117f232b682e1b20e8bd7854b95bee306a36 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Tue, 22 Nov 2022 18:01:12 +0100 Subject: [PATCH 27/30] added eoscifguidelines in test --- .../eosc-future/photic-zone-transformed.xml | 98 ------------------- .../oa/provision/eosc-future/photic-zone.json | 2 +- 2 files changed, 1 insertion(+), 99 deletions(-) delete mode 100644 dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/photic-zone-transformed.xml diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/photic-zone-transformed.xml 
b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/photic-zone-transformed.xml deleted file mode 100644 index 79830b0f7..000000000 --- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/photic-zone-transformed.xml +++ /dev/null @@ -1,98 +0,0 @@ - - -
- https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be - 2022-11-15T12:29:19Z - rohub_data - ro-crate_data -
- - - https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be - - https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be - - - https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/b1b617b2-6b79-4bae-9fa6-b76945645626 - https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/78103994-30be-4875-bf89-5acd752b5c3d - https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/18fd1c70-249b-4c67-80ee-539f801a0da7 - https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/32faa2eb-4cc8-401f-ac5c-bec2849b70e1 - https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/4c253f5a-d427-40c2-9e9f-6063ae087239 - https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/371b1957-078c-472b-a195-af7bce152c10 - https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/82f9e4b8-01b4-4e50-9e27-ec9d337c8d74 - - RO-crate - - Creative Commons Attribution 4.0 International - open access - - - Mapping the photic zone of the Mediterranean Sea - - - Estimating the penetration of light along the water column from satellite data to map the photic zone in the Mediterranean Sea - - CNR-ISMAR - - - Giorgio Castellan - - - Lorenzo Angeletti - - - Paolo Montagna - - - Marco Taviani - - - - 2022-11-14T16:32:45Z - - - Estimating the penetration of light along the water column from satellite data to map the photic zone in the Mediterranean Sea - - 2022 - - open access - - - 813.478 KB - - - Earth sciences - Ecology - Optics - - - https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be - 0048 - 2022-11-14 - OPEN - https://creativecommons.org/licenses/by/4.0/legalcode - - - - - - - -
\ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/photic-zone.json b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/photic-zone.json index ffef2740a..9729c6051 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/photic-zone.json +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/photic-zone.json @@ -1 +1 @@ -{"collectedfrom":[{"key":"10|fairsharing_::1b69ebedb522700034547abc5652ffac","value":"ROHub","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1669132109711,"id":"50|w3id________::68126da991bd76d8be494bddfbf7a1bb","originalId":["50|fsh_____4119::68126da991bd76d8be494bddfbf7a1bb","https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be"],"pid":[{"value":"https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be","qualifier":{"classid":"w3id","classname":"w3id.org","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"dateofcollection":"2022-11-15T12:29:19Z","dateoftransformation":"2022-11-15T12:29:19Z","extraInfo":[],"oaiprovenance":null,"processingchargeamount":null,"processingchargecurrency":null,"measures":null,"author":[{"fullname":"Giorgio Castellan","name":"","surname":"","rank":1,"pid":[],"affiliation":[]},{"fullname":"Lorenzo 
Angeletti","name":"","surname":"","rank":2,"pid":[],"affiliation":[]},{"fullname":"Paolo Montagna","name":"","surname":"","rank":3,"pid":[],"affiliation":[]},{"fullname":"Marco Taviani","name":"","surname":"","rank":4,"pid":[],"affiliation":[]}],"resulttype":{"classid":"other","classname":"other","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:languages","schemename":"dnet:languages"},"country":[],"subject":[{"value":"Earth sciences","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"Ecology","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"Optics","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[{"value":"Mapping the photic zone of the Mediterranean Sea","qualifier":{"classid":"main title","classname":"main 
title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"relevantdate":[{"value":"2022-11-14T16:32:45Z","qualifier":{"classid":"Issued","classname":"Issued","schemeid":"dnet:dataCite_date","schemename":"dnet:dataCite_date"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"description":[{"value":"Estimating the penetration of light along the water column from satellite data to map the photic zone in the Mediterranean Sea","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"dateofacceptance":{"value":"2022-11-14","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"publisher":{"value":"CNR-ISMAR","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"embargoenddate":null,"sou
rce":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"RO-crate","classname":"RO-crate","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"coverage":[],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[],"externalReference":[],"instance":[{"license":{"value":"https://creativecommons.org/licenses/by/4.0/legalcode","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes","openAccessRoute":null},"instancetype":{"classid":"0048","classname":"Research Object","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"hostedby":{"key":"10|fairsharing_::1b69ebedb522700034547abc5652ffac","value":"ROHub","dataInfo":null},"url":["https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be"],"distributionlocation":null,"collectedfrom":{"key":"10|fairsharing_::1b69ebedb522700034547abc5652ffac","value":"ROHub","dataInfo":null},"pid":[{"value":"https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be","qualifier":{"classid":"w3id","classname":"w3id.org","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"alternateIdentifier":[],"dateofacceptance":{"value":"2022-11-14","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","infe
renceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"processingchargeamount":null,"processingchargecurrency":null,"refereed":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"measures":null}],"eoscifguidelines":null,"contactperson":[],"contactgroup":[],"tool":[]} +{"collectedfrom":[{"key":"10|fairsharing_::1b69ebedb522700034547abc5652ffac","value":"ROHub","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1669134693781,"id":"50|w3id________::68126da991bd76d8be494bddfbf7a1bb","originalId":["50|fsh_____4119::68126da991bd76d8be494bddfbf7a1bb","https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be"],"pid":[{"value":"https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be","qualifier":{"classid":"w3id","classname":"w3id.org","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"dateofcollection":"2022-11-15T12:29:19Z","dateoftransformation":"2022-11-15T12:29:19Z","extraInfo":[],"oaiprovenance":null,"processingchargeamount":null,"processingchargecurrency":null,"measures":null,"author":[{"fullname":"Giorgio Castellan","name":"","surname":"","rank":1,"pid":[],"affiliation":[]},{"fullname":"Lorenzo 
Angeletti","name":"","surname":"","rank":2,"pid":[],"affiliation":[]},{"fullname":"Paolo Montagna","name":"","surname":"","rank":3,"pid":[],"affiliation":[]},{"fullname":"Marco Taviani","name":"","surname":"","rank":4,"pid":[],"affiliation":[]}],"resulttype":{"classid":"other","classname":"other","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:languages","schemename":"dnet:languages"},"country":[],"subject":[{"value":"Earth sciences","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"Ecology","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"value":"Optics","qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[{"value":"Mapping the photic zone of the Mediterranean Sea","qualifier":{"classid":"main title","classname":"main 
title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"relevantdate":[{"value":"2022-11-14T16:32:45Z","qualifier":{"classid":"Issued","classname":"Issued","schemeid":"dnet:dataCite_date","schemename":"dnet:dataCite_date"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"description":[{"value":"Estimating the penetration of light along the water column from satellite data to map the photic zone in the Mediterranean Sea","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"dateofacceptance":{"value":"2022-11-14","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"publisher":{"value":"CNR-ISMAR","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"embargoenddate":null,"sou
rce":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"RO-crate","classname":"RO-crate","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"coverage":[],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[],"externalReference":[],"instance":[{"license":{"value":"https://creativecommons.org/licenses/by/4.0/legalcode","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes","openAccessRoute":null},"instancetype":{"classid":"0048","classname":"Research Object","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"hostedby":{"key":"10|fairsharing_::1b69ebedb522700034547abc5652ffac","value":"ROHub","dataInfo":null},"url":["https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be"],"distributionlocation":null,"collectedfrom":{"key":"10|fairsharing_::1b69ebedb522700034547abc5652ffac","value":"ROHub","dataInfo":null},"pid":[{"value":"https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be","qualifier":{"classid":"w3id","classname":"w3id.org","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"alternateIdentifier":[],"dateofacceptance":{"value":"2022-11-14","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","infe
renceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"processingchargeamount":null,"processingchargecurrency":null,"refereed":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"measures":null}],"eoscifguidelines":[{"code":"EOSC::Jupyter Notebook","label":"EOSC::Jupyter Notebook","url":"","semanticRelation":"compliesWith"},{"code":"EOSC::Data Cube","label":"EOSC::Data Cube","url":"","semanticRelation":"compliesWith"},{"code":"EOSC::RO-crate","label":"EOSC::RO-crate","url":"","semanticRelation":"compliesWith"}],"contactperson":[],"contactgroup":[],"tool":[]} \ No newline at end of file From a79c47522dbdd5e4af90fda3aaf52cb03de9a7f0 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 23 Nov 2022 10:17:49 +0100 Subject: [PATCH 28/30] updated ORCID datasource identifier --- .../eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java index f92040c24..ba7c7dd01 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java @@ -554,7 +554,7 @@ public class PublicationToOaf implements Serializable { private KeyValue createCollectedFrom() { KeyValue cf = new KeyValue(); cf.setValue(ModelConstants.ORCID.toUpperCase()); - cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a"); + cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "cd0f74b5955dc87fd0605745c4b49ee8"); return cf; } From 0e3edc501897bfdac52bfeda6316181b9c4d5ce3 Mon 
Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 23 Nov 2022 11:26:36 +0100 Subject: [PATCH 29/30] [Bulk Tag] fixed issue in verb name --- .../criteria/ContainsVerbIgnoreCase.java | 2 +- .../bulktag/criteria/EqualVerbIgnoreCase.java | 2 +- .../criteria/NotContainsVerbIgnoreCase.java | 2 +- .../criteria/NotEqualVerbIgnoreCase.java | 2 +- .../communityconfiguration/tagging_conf.xml | 66 +++++++++---------- 5 files changed, 37 insertions(+), 37 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerbIgnoreCase.java index a4a6f5663..501eb51b9 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerbIgnoreCase.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerbIgnoreCase.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.bulktag.criteria; import java.io.Serializable; -@VerbClass("contains_ignorecase") +@VerbClass("contains_caseinsensitive") public class ContainsVerbIgnoreCase implements Selection, Serializable { private String param; diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerbIgnoreCase.java index c5f0ce070..1cd07755c 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerbIgnoreCase.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerbIgnoreCase.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.bulktag.criteria; import java.io.Serializable; -@VerbClass("equals_ignorecase") +@VerbClass("equals_caseinsensitive") public class EqualVerbIgnoreCase implements Selection, Serializable { private String param; diff --git 
a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerbIgnoreCase.java index b21be83f0..e12b65a27 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerbIgnoreCase.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerbIgnoreCase.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.bulktag.criteria; import java.io.Serializable; -@VerbClass("not_contains_ignorecase") +@VerbClass("not_contains_caseinsensitive") public class NotContainsVerbIgnoreCase implements Selection, Serializable { private String param; diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerbIgnoreCase.java index c6958a641..c1749621e 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerbIgnoreCase.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerbIgnoreCase.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.bulktag.criteria; import java.io.Serializable; -@VerbClass("not_equals_ignorecase") +@VerbClass("not_equals_caseinsensitive") public class NotEqualVerbIgnoreCase implements Selection, Serializable { private String param; diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.xml b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.xml index 06c57511d..4e580edf5 100644 --- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.xml +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.xml @@ -1193,7 +1193,7 
@@ - {"criteria":[{"constraint":[{"verb":"equals_ignorecase","field":"subject","value":"ciencias de la comunicación"}, + {"criteria":[{"constraint":[{"verb":"equals_caseinsensitive","field":"subject","value":"ciencias de la comunicación"}, {"verb":"equals","field":"subject","value":"Miriam"}]}, {"constraint":[{"verb":"equals","field":"subject","value":"miriam"}]}]} @@ -1317,81 +1317,81 @@ opendoar____::358aee4cc897452c00244351e4d91f69 - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}}]} + {"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}}]} re3data_____::7b0ad08687b2c960d5aeef06f811d5e6 - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + {"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]} driver______::bee53aa31dc2cbb538c10c2b65fa5824 - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + 
{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]} openaire____::437f4b072b1aa198adcbc35910ff3b98 - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + {"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]} openaire____::081b82f96300b6a6e3d282bad31cb6e2 - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + {"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]} openaire____::9e3be59865b2c1c335d32dae2fe7b254 - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + {"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]}, + 
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]} opendoar____::8b6dd7db9af49e67306feb59a8bdc52c - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + {"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]} share_______::4719356ec8d7d55d3feb384ce879ad6c - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + {"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]} share_______::bbd802baad85d1fd440f32a7a3a2c2b1 - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + {"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]}, + 
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]} opendoar____::6f4922f45568161a8cdf4ad2299f6d23 - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]}, - {"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]} + {"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]}, + {"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]} re3data_____::7980778c78fb4cf0fab13ce2159030dc - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCov"}]}]} + {"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCov"}]}]} re3data_____::978378def740bbf2bfb420de868c460b - {"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCov"}]}]} + {"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCov"}]}]} From 90c8f9cb6178cc7d257d4f3000e48d734a6c2b5d Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: 
Wed, 23 Nov 2022 12:18:44 +0100 Subject: [PATCH 30/30] tests for EOSC Future --- .../provision/IndexRecordTransformerTest.java | 14 + .../eosc-future/software-justthink-claim.xml | 305 +++++++++++++ .../eosc-future/software-justthink.xml | 429 ++++++++++++++++++ 3 files changed, 748 insertions(+) create mode 100644 dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/software-justthink-claim.xml create mode 100644 dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/software-justthink.xml diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java index e0fbb2a2f..17c3cdb30 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java @@ -128,6 +128,20 @@ public class IndexRecordTransformerTest { testRecordTransformation(record); } + @Test + public void testForEOSCFutureSoftwareNotebook() throws IOException, TransformerException { + final String record = IOUtils + .toString(getClass().getResourceAsStream("eosc-future/software-justthink.xml")); + testRecordTransformation(record); + } + + @Test + public void testForEOSCFutureSoftwareNotebookClaim() throws IOException, TransformerException { + final String record = IOUtils + .toString(getClass().getResourceAsStream("eosc-future/software-justthink-claim.xml")); + testRecordTransformation(record); + } + @Test void testDoiUrlNormalization() throws MalformedURLException { diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/software-justthink-claim.xml 
b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/software-justthink-claim.xml new file mode 100644 index 000000000..02089bb30 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/software-justthink-claim.xml @@ -0,0 +1,305 @@ + + +
+ od______2659::3801993ea8f970cfc991277160edf277 + 2022-08-08T03:06:13Z + under curation + +
+ + + + JUSThink + Alignment Analysis + Norman, Utku + Dinkar, Tanvi + Bruno, Barbara + Clavel, Chloé + + + + +

+ 1. Description +

+

This repository contains tools to automatically analyse how + participants align their use of task-specific referents in their + dialogue and actions for a collaborative learning activity, and how + it relates to the task success (i.e. their learning + outcomes and task performance).

+

As a use case, it processes data from a collaborative problem solving + activity named JUSThink [1, 2], i.e. + JUSThink Dialogue and Actions Corpus data set that is available from the + Zenodo Repository, DOI: 10.5281/zenodo.4627104, and reproduces the results and figures + in [3].

+

In brief:

+
    +
  1. JUSThink Dialogue and Actions Corpus contains + transcripts, event logs, and test responses of children aged 9 + through 12, as they participate in the JUSThink activity [1, 2] + in pairs of two, to solve a problem on graphs together.
  2. +
  3. The JUSThink activity and its study is first + described in [1], and elaborated with findings concerning the link + between children's learning, performance in the activity, and + perception of self, the other and the robot in [2].
  4. +
  5. Alignment analysis in our work [3] studies the participants' use of + expressions that are related to the task at hand, their follow up + actions of these expressions, and how it links to task success.
  6. +
+

+ 2. Publications +

+

If you use this work in an academic context, please cite the following + publications:

+
    +
  • +

    Norman*, U., Dinkar*, T., Bruno, B., & Clavel, C. (2022). + Studying Alignment in a Collaborative Learning Activity via + Automatic Methods: The Link Between What We Say and Do. Dialogue + & Discourse, 13(2), 1–48. *Contributed equally to this + work. https://doi.org/10.5210/dad.2022.201

    +
  • +
  • +

    Norman, U., Dinkar, T., Bruno, B., & Clavel, C. (2021). + JUSThink Alignment Analysis. In Dialogue & Discourse + (v1.0.0, Vol. 13, Number 2, pp. 1–48). Zenodo. https://doi.org/10.5281/zenodo.4675070

    +
  • +
+

+ 3. Content +

+

The tools provided in this repository consists of 7 Jupyter Notebooks + written in Python 3, and two additional external tools utilised by the + notebooks.

+

+ 3.1. Jupyter Notebooks +

+

We highlight that the notebooks up until the last (i.e. to test the + hypotheses (tools/7_test_the_hypotheses.ipynb)) present a general + pipeline to process event logs, test responses and transcripts to + extract measures of task performance, learning outcomes, and measures of + alignment.

+
    +
  1. Extract task performance (and other features) from the logs + (tools/1_extract_performance_and_other_features_from_logs.ipynb): + Extracts various measures of task behaviour from the logs, at + varying granularities of the activity (i.e. the whole corpus, task, + attempt, and turn levels). In later notebooks, we focus on one of + the features to estimate the task performance of a team: (minimum) + error.
  2. +
  3. Extract learning outcomes from the test responses + (tools/2_extract_learning_gain_from_test_responses.ipynb): Extracts + measures of learning outcomes from the responses to the pre-test and + the post-test. In later notebooks, we focus on one of the features + to estimate the learning outcome of a team: relative learning gain + [4]
  4. +
  5. Select and visualise a subset of teams for + transcription + (tools/3_visualise_transcribed_teams.ipynb): Visualises the + transcribed teams among the other teams in the feature space spanned + by task performance and learning outcome, as well as the + distribution of their number of attempts and turns.
  6. +
  7. Extract routines from transcripts + (tools/4_extract_routines_from_transcripts.ipynb) (uses dialign to + extract routines): Extracts routines of referring expressions that + are "fixed", i.e. become shared or established amongst + interlocutors.
  8. +
  9. Combine transcripts with logs + (tools/5_construct_the_corpus_by_combining_transcripts_with_logs.ipynb): + Merges transcripts with event logs to have a combined dialogue and + actions corpus, to be processed e.g. to detect follow-up + actions.
  10. +
  11. Recognise instructions and detect follow-up actions + (tools/6_recognise_instructions_detect_follow-up_actions.ipynb): + Extracts verbalised instruction such as "connect Mount Basel to + Montreux", and pairs them with the follow-up action that may + match (e.g. if the other connects Basel to Montreux) or + mismatch (e.g. if the other connects Basel to + Neuchatel) with the instruction.
  12. +
  13. Test the hypotheses in [3] (tools/7_test_the_hypotheses.ipynb) (uses + effsize to estimate effect size, specifically + Cliff's Delta): Considers each research questions and hypotheses + studied in [3] and generates the results in [3].
  14. +
+

+ 3.2. External Tools +

+
    +
  1. dialign + tool to extract routines, specifically Release 1.0 from dialign-1.0.zip:\n It extracts routine expressions that are + "shared" among the participants from transcripts. \n It is + used as an external module (in accordance with its CeCILL-B License, + see License).
  2. +
  3. effsize tool to compute estimators of effect + size.\n We specifically use it to compute Cliff's Delta, which + quantifies the amount difference between two groups of observations, + by computing the Cliff's Delta statistic.\n It is taken from + project DABEST (see License).
  4. +
+

+ 4. Research Questions and Hypotheses in [3] +

+
    +
  • RQ1 Lexical alignment: How do the interlocutors + use expressions related to the task? Is this associated + with task success?
      +
    • H1.1: Task-specific referents become + routine early for more successful teams.
    • +
    • H1.2: Hesitation phenomena are more likely + to occur in the vicinity of priming and establishment of + task-specific referents for more successful teams.
    • +
    +
  • +
  • RQ2 Behavioural alignment: How do the interlocutors + follow up these expressions with actions? Is this + associated with task success?
      +
    • H2.1: Instructions are more likely to be + followed by a corresponding action early in the dialogue for + more successful teams.
    • +
    • H2.2: When instructions are followed by a + corresponding or a different action, the action is more + likely to be in the vicinity of information management + phenomena for more successful teams.
    • +
    +
  • +
+

The RQs and Hs are addressed in the notebook for testing the hypotheses + (i.e. tools/7_test_the_hypotheses.ipynb).

+

+ Acknowledgements +

+

This project has received funding from the European Union's Horizon + 2020 research and innovation programme under grant agreement No 765955. + Namely, the ANIMATAS Project.

+

+ License +

+

The whole package is under MIT License, see the LICENSE + file.

+

Classes under the tools/effsize package were taken from + project DABEST, Copyright 2016-2020 Joses W. Ho. + These classes are licensed under the BSD 3-Clause Clear License. See + tools/effsize/LICENSE file for additional + details.

+

Classes under the tools/dialign-1.0 package were taken + from project dialign. These classes are licensed under the + CeCILL-B License. This package is used as an "external + module", see tools/dialign-1.0/LICENSE.txt for + additional details.

+
+ + + + Zenodo + + + + + + + + + + + + + + + + + + + oai:zenodo.org:4675070 + + oai:zenodo.org:4675070 + 10.5281/zenodo.4675070 + + + + false + false + 0.9 + + + + + + corda__h2020::c4515ebef538a734cf11f795347f5dac + 765955 + ANIMATAS + Advancing intuitive human-machine interaction with human-like + social capabilities for education in schools + + + + ec__________::EC::H2020 + + + + + + + + + + + + + https://zenodo.org/record/4675070 + + + +
+
+
+
+
diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/software-justthink.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/software-justthink.xml new file mode 100644 index 000000000..9c0f4ea7d --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/software-justthink.xml @@ -0,0 +1,429 @@ + + +
+ doi_dedup___::c054151b6a8c4f41c7acf160651a6503 + 2022-10-13T00:15:44+0000 + 2022-10-13T07:44:29.152Z +
+ + + + + + oai:zenodo.org:4675070 + 50|od______2659::3801993ea8f970cfc991277160edf277 + oai:zenodo.org:6974562 + 50|od______2659::9c87ff4a5e7710052b873088e7265072 + 10.5281/zenodo.4675069 + 10.5281/zenodo.4675070 + 10.5281/zenodo.6974562 + 10.5281/zenodo.4675069 + + + + + + JUSThink Alignment + Analysis + + Norman, Utku + Dinkar, Tanvi + Bruno, Barbara + Clavel, Chloé + 2022-08-08 + &lt;strong>1. Description&lt;/strong> This repository + contains&lt;strong> tools to automatically analyse how participants align + their use of task-specific referents in their dialogue and actions for a + collaborative learning activity, and how it relates to the task + success&lt;/strong> (i.e. their learning outcomes and task performance). As + a use case, it processes data from a collaborative problem solving activity + named JUSThink [1, 2], i.e. JUSThink Dialogue and Actions Corpus data set that + is available from the Zenodo Repository, DOI: 10.5281/zenodo.4627104, and + reproduces the results and figures in [3]. In brief: &lt;strong>JUSThink + Dialogue and Actions Corpus&lt;/strong> contains transcripts, event logs, + and test responses of children aged 9 through 12, as they participate in the + JUSThink activity [1, 2] in pairs of two, to solve a problem on graphs together. + &lt;strong>The JUSThink activity and its study&lt;/strong> is first + described in [1], and elaborated with findings concerning the link between + children's learning, performance in the activity, and perception of self, the + other and the robot in [2]. &lt;strong>Alignment analysis in our work + [3]&lt;/strong> studies the participants' use of expressions that are + related to the task at hand, their follow up actions of these expressions, and + how it links to task success. &lt;strong>Changes in Release + v1.1.0:&lt;/strong> updated with the publication information, finalized + paper structure, research questions and hypotheses as in the published article: + U. Norman*&lt;em>, &lt;/em>T. Dinkar*, B. 
Bruno, and C. Clavel, + "Studying Alignment in a Collaborative Learning Activity via Automatic Methods: + The Link Between What We Say and Do," Dialogue &amp;amp; Discourse, 13(2), + 1–48. *Contributed equally to this work. 10.5210/dad.2022.201. + &lt;strong>Full Changelog:&lt;/strong> + https://github.com/chili-epfl/justhink-alignment-analysis/compare/v1.0.0...v1.1.0 + &lt;strong>2. Publications&lt;/strong> If you use this work in an + academic context, please cite the following publications: Norman*, U., Dinkar*, + T., Bruno, B., &amp;amp; Clavel, C. (2022). Studying Alignment in a + Collaborative Learning Activity via Automatic Methods: The Link Between What We + Say and Do. Dialogue &amp;amp; Discourse, 13(2), 1–48. *Contributed equally + to this work. https://doi.org/10.5210/dad.2022.201 Norman, U., Dinkar, T., + Bruno, B., &amp;amp; Clavel, C. (2021). JUSThink Alignment Analysis. In + Dialogue &amp;amp; Discourse (v1.1.0, Vol. 13, Number 2, pp. 1–48). Zenodo. + https://doi.org/10.5281/zenodo.6974562 &lt;strong>3. Content&lt;/strong> + The tools provided in this repository consists of 7 Jupyter Notebooks written in + Python 3, and two additional external tools utilised by the notebooks. + &lt;strong>3.1. Jupyter Notebooks&lt;/strong> We highlight that the + notebooks up until the last (i.e. to test the hypotheses + (tools/7_test_the_hypotheses.ipynb)) present a general pipeline to process event + logs, test responses and transcripts to extract measures of task performance, + learning outcomes, and measures of alignment. &lt;strong>Extract task + performance (and other features) from the logs + &lt;/strong>(tools/1_extract_performance_and_other_features_from_logs.ipynb): + Extracts various measures of task behaviour from the logs, at varying + granularities of the activity (i.e. the whole corpus, task, attempt, and turn + levels). In later notebooks, we focus on one of the features to estimate the + task performance of a team: (minimum) error. 
&lt;strong>Extract learning + outcomes from the test responses&lt;/strong> + (tools/2_extract_learning_gain_from_test_responses.ipynb): Extracts measures of + learning outcomes from the responses to the pre-test and the post-test. In later + notebooks, we focus on one of the features to estimate the learning outcome of a + team: relative learning gain [4] &lt;strong>Select and visualise a subset of + teams for transcription&lt;/strong> + (tools/3_visualise_transcribed_teams.ipynb): Visualises the transcribed teams + among the other teams in the feature space spanned by task performance and + learning outcome, as well as the distribution of their number of attempts and + turns. &lt;strong>Extract routines from transcripts&lt;/strong> + (tools/4_extract_routines_from_transcripts.ipynb) (uses dialign to extract + routines): Extracts routines of referring expressions that are "fixed", i.e. + become shared or established amongst interlocutors. &lt;strong>Combine + transcripts with logs&lt;/strong> + (tools/5_construct_the_corpus_by_combining_transcripts_with_logs.ipynb): Merges + transcripts with event logs to have a combined dialogue and actions corpus, to + be processed e.g. to detect follow-up actions. &lt;strong>Recognise + instructions and detect follow-up actions&lt;/strong> + (tools/6_recognise_instructions_detect_follow-up_actions.ipynb): Extracts + verbalised instruction such as "connect Mount Basel to Montreux", and pairs them + with the follow-up action that may &lt;em>match&lt;/em> (e.g. if the + other connects Basel to Montreux) or &lt;em>mismatch&lt;/em> (e.g. if + the other connects Basel to Neuchatel) with the instruction. &lt;strong>Test + the hypotheses &lt;/strong>in [3] (tools/7_test_the_hypotheses.ipynb) (uses + &lt;strong>effsize&lt;/strong> to estimate effect size, specifically + Cliff's Delta): Considers each research questions and hypotheses studied in [3] + and generates the results in [3]. &lt;strong>3.2. 
External + Tools&lt;/strong> &lt;strong>dialign tool&lt;/strong> to extract + routines, specifically Release 1.0 from dialign-1.0.zip:&lt;br> It extracts + routine expressions that are "shared" among the participants from transcripts. + &lt;br> It is used as an external module (in accordance with its CeCILL-B + License, see &lt;strong>License&lt;/strong>). &lt;strong>effsize + tool&lt;/strong> to compute estimators of effect size.&lt;br> We + specifically use it to compute Cliff's Delta, which quantifies the amount + difference between two groups of observations, by computing the Cliff's Delta + statistic.&lt;br> It is taken from project DABEST (see + &lt;strong>License&lt;/strong>). &lt;strong>4. Research Questions + and Hypotheses in [3]&lt;/strong> &lt;strong>RQ1 Lexical + alignment&lt;/strong>: How do the interlocutors &lt;em>use&lt;/em> + expressions related to the task? Is this associated with task success? + &lt;strong>H1.1&lt;/strong>: Task-specific referents become routine + early for more successful teams. &lt;strong>H1.2&lt;/strong>: Hesitation + phenomena are more likely to occur in the vicinity of priming and establishment + of task-specific referents for more successful teams. &lt;strong>RQ2 + Behavioural alignment&lt;/strong>: How do the interlocutors + &lt;em>follow up&lt;/em> these expressions with actions? Is this + associated with task success? &lt;strong>H2.1&lt;/strong>: Instructions + are more likely to be followed by a corresponding action early in the dialogue + for more successful teams. &lt;strong>H2.2&lt;/strong>: When + instructions are followed by a corresponding or a different action, the action + is more likely to be in the vicinity of information management phenomena for + more successful teams. The RQs and Hs are addressed in the notebook for testing + the hypotheses (i.e. tools/7_test_the_hypotheses.ipynb). 
+ &lt;strong>Acknowledgements&lt;/strong> This project has received + funding from the European Union's Horizon 2020 research and innovation programme + under grant agreement No 765955. Namely, the ANIMATAS Project. + &lt;strong>License&lt;/strong> The whole package is under MIT License, + see the &lt;strong>LICENSE&lt;/strong> file. Classes under the + &lt;strong>tools/effsize&lt;/strong> package were taken from project + &lt;strong>DABEST&lt;/strong>, Copyright 2016-2020 Joses W. Ho. These + classes are licensed under the BSD 3-Clause Clear License. See + &lt;strong>tools/effsize/LICENSE&lt;/strong> file for additional + details. Classes under the &lt;strong>tools/dialign-1.0&lt;/strong> + package were taken from project &lt;strong>dialign&lt;/strong>. These + classes are licensed under the CeCILL-B License. This package is used as an + "external module", see&lt;strong> + tools/dialign-1.0/LICENSE.txt&lt;/strong> for additional + details. + {"references": ["[1] J. Nasir, U. Norman, B. Bruno, and P. Dillenbourg, + \"You Tell, I Do, and We Swap until we Connect All the Gold Mines!,\" ERCIM + News, vol. 2020, no. 120, 2020, [Online]. Available: + https://ercim-news.ercim.eu/en120/special/you-tell-i-do-and-we-swap-until-we-connect-all-the-gold-mines", + "[2] J. Nasir*, U. Norman*, B. Bruno, and P. Dillenbourg, \"When Positive + Perception of the Robot Has No Effect on Learning,\" in 2020 29th IEEE + International Conference on Robot and Human Interactive Communication (RO-MAN), + Aug. 2020, pp. 313\u2013320, doi: 10.1109/RO-MAN47096.2020.9223343", "[3] U. + Norman*, T. Dinkar*, B. Bruno, and C. Clavel, \"Studying Alignment in a + Collaborative Learning Activity via Automatic Methods: The Link Between What We + Say and Do,\" Dialogue &amp;amp; Discourse, vol. 13, no. 2, pp. 1\u201348, + Aug. 2022, doi: 10.5210/dad.2022.201.", "[4] M. Sangin, G. Molinari, M.-A. + N\u00fcssli, and P. 
Dillenbourg, \"Facilitating peer knowledge modeling: Effects + of a knowledge awareness tool on collaborative learning outcomes and + processes,\"\" Computers in Human Behavior, vol. 27, no. 3, pp. 1059\u20131067, + May 2011, doi: 10.1016/j.chb.2010.05.032."]} + alignment + situated + dialogue + collaborative + learning + spontaneous + speech + disfluency + mutual + understanding + + 2021-04-09 + 2022-08-08 + Zenodo + + + + + + + + + + + true + false + 0.8 + dedup-result-decisiontree-v3 + + + + + doi_dedup___::ae235765bbc422195a6c9f632b2d77eb + + 2104.04429 + + arXiv + + 2022-08-05 + Studying + Alignment in a Collaborative Learning Activity via Automatic Methods: + The Link Between What We Say and Do + + + 10.48550/arxiv.2104.04429 + 10.5210/dad.2022.201 + + + corda__h2020::c4515ebef538a734cf11f795347f5dac + Advancing intuitive human-machine interaction with human-like social + capabilities for education in schools + 765955 + + + ec__________::EC::H2020 + ec__________::EC::H2020::MSCA-ITN-ETN + + ANIMATAS + + + doi_dedup___::0a6314b0ed275d915f5b57a259375691 + 2021-03-22 + Zenodo + 10.5281/zenodo.4627104 + JUSThink Dialogue and Actions Corpus + 10.5281/zenodo.4627103 + + + + + + + Zenodo + 10.5281/zenodo.4675070 + JUSThink Alignment Analysis + 2021-04-09 + + + + 2022-08-08 + Zenodo + 10.5281/zenodo.6974562 + + JUSThink Alignment Analysis (v1.1.0) + + + JUSThink + Alignment Analysis (v1.1.0) + 2022-08-08 + Zenodo + 10.5281/zenodo.4675069 + + + + + + + 2022-08-08 + + 10.5281/zenodo.4675069 + + https://opensource.org/licenses/MIT + + https://doi.org/10.5281/zenodo.4675069 + + + + + + + 2022-08-08 + + 10.5281/zenodo.6974562 + + https://opensource.org/licenses/MIT + + https://doi.org/10.5281/zenodo.6974562 + + + + + + + 2021-04-09 + + 10.5281/zenodo.4675070 + + https://opensource.org/licenses/MIT + + https://doi.org/10.5281/zenodo.4675070 + + + + + + +
+