From e4eac1d20bd8981939fe4efbbd94c26d680d6999 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 13 May 2022 11:01:33 +0200 Subject: [PATCH 1/2] [EOSC TAG] added code to remove EOSC Jupyter Notebook from subjects and put EOSC as classid in the qualifier --- .../java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java | 11 +++++++++-- .../dhp/sx/graph/SparkConvertRDDtoDataset.scala | 12 +++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java index 561e1d57e3..e8c79e11da 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java @@ -30,7 +30,7 @@ public class SparkEoscTag { public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static final Qualifier EOSC_QUALIFIER = OafMapperUtils .qualifier( - "eosc", + "EOSC", "European Open Science Cloud", ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES); public static final DataInfo EOSC_DATAINFO = OafMapperUtils @@ -95,7 +95,14 @@ public class SparkEoscTag { if (containsCriteriaNotebook(s)) { sbject.add(EOSC_NOTEBOOK); - + if (sbject.stream().anyMatch(sb -> sb.getValue().equals("EOSC Jupyter Notebook"))){ + sbject = sbject.stream().map(sb -> { + if (sb.getValue().equals("EOSC Jupyter Notebook")){ + return null; + } + return sb; + }).filter(Objects::nonNull).collect(Collectors.toList()); + } } if (containsCriteriaGalaxy(s)) { sbject.add(EOSC_GALAXY); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala index 7c3a212acc..bd970a5cf4 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala @@ -103,21 +103,19 @@ object SparkConvertRDDtoDataset { "IsAmongTopNSimilarDocuments" ) - val rddRelation = spark.sparkContext .textFile(s"$sourcePath/relation") .map(s => mapper.readValue(s, classOf[Relation])) .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false) .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50")) //filter OpenCitations relations - .filter(r => r.getCollectedfrom!= null && r.getCollectedfrom.size()>0 && !r.getCollectedfrom.asScala.exists(k => "opencitations".equalsIgnoreCase(k.getValue))) + .filter(r => + r.getCollectedfrom != null && r.getCollectedfrom.size() > 0 && !r.getCollectedfrom.asScala.exists(k => + "opencitations".equalsIgnoreCase(k.getValue) + ) + ) .filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass))) spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath") - - - - - } } From 0dc33ea391f033578e890a8335dc28d36b8128d8 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 16 May 2022 09:20:30 +0200 Subject: [PATCH 2/2] [openorgs] fixed parent/child query, using the correct semantic labels --- .../dnetlib/dhp/oa/graph/sql/queryParentChildRelsOpenOrgs.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryParentChildRelsOpenOrgs.sql b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryParentChildRelsOpenOrgs.sql index 388fee3f58..0ac8434011 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryParentChildRelsOpenOrgs.sql +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryParentChildRelsOpenOrgs.sql @@ -10,4 +10,4 @@ SELECT 'OpenOrgs Database' AS collectedfromname, 'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction FROM relationships -WHERE reltype = 'Child' OR reltype = 'Parent' \ No newline at end of file +WHERE reltype = 'IsChildOf' OR reltype = 'IsParentOf' \ No newline at end of file