From e4eac1d20bd8981939fe4efbbd94c26d680d6999 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 13 May 2022 11:01:33 +0200 Subject: [PATCH] [EOSC TAG] added code to remove EOSC Jupyter Notebook from subjects and put EOSC as classid in the qualifier --- .../java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java | 11 +++++++++-- .../dhp/sx/graph/SparkConvertRDDtoDataset.scala | 12 +++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java index 561e1d57e..e8c79e11d 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java @@ -30,7 +30,7 @@ public class SparkEoscTag { public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static final Qualifier EOSC_QUALIFIER = OafMapperUtils .qualifier( - "eosc", + "EOSC", "European Open Science Cloud", ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES); public static final DataInfo EOSC_DATAINFO = OafMapperUtils @@ -95,7 +95,14 @@ public class SparkEoscTag { if (containsCriteriaNotebook(s)) { sbject.add(EOSC_NOTEBOOK); - + if (sbject.stream().anyMatch(sb -> sb.getValue().equals("EOSC Jupyter Notebook"))){ + sbject = sbject.stream().map(sb -> { + if (sb.getValue().equals("EOSC Jupyter Notebook")){ + return null; + } + return sb; + }).filter(Objects::nonNull).collect(Collectors.toList()); + } } if (containsCriteriaGalaxy(s)) { sbject.add(EOSC_GALAXY); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala index 7c3a212ac..bd970a5cf 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala @@ -103,21 +103,19 @@ object SparkConvertRDDtoDataset { "IsAmongTopNSimilarDocuments" ) - val rddRelation = spark.sparkContext .textFile(s"$sourcePath/relation") .map(s => mapper.readValue(s, classOf[Relation])) .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false) .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50")) //filter OpenCitations relations - .filter(r => r.getCollectedfrom!= null && r.getCollectedfrom.size()>0 && !r.getCollectedfrom.asScala.exists(k => "opencitations".equalsIgnoreCase(k.getValue))) + .filter(r => + r.getCollectedfrom != null && r.getCollectedfrom.size() > 0 && !r.getCollectedfrom.asScala.exists(k => + "opencitations".equalsIgnoreCase(k.getValue) + ) + ) .filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass))) spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath") - - - - - } }