From 6dc68c48e0d8720b875173692eab09d9e46e77cb Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 21 Apr 2022 16:19:04 +0200 Subject: [PATCH] [EOSCTag] - --- .../eu/dnetlib/dhp/bulktag/SparkEoscTag.java | 61 +++++++++++++++++-- 1 file changed, 57 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java index 2b44275cd3..7dc3b5878b 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java @@ -7,11 +7,18 @@ import eu.dnetlib.dhp.bulktag.community.CommunityConfiguration; import eu.dnetlib.dhp.bulktag.community.CommunityConfigurationFactory; import eu.dnetlib.dhp.bulktag.community.ProtoMap; import eu.dnetlib.dhp.bulktag.community.QueryInformationSystem; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -25,6 +32,12 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; public class SparkEoscTag { private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class); public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + public final static StructuredProperty EOSC_NOTEBOOK = OafMapperUtils.structuredProperty( + "EOSC::Jupyter Notebook", OafMapperUtils.qualifier("eosc","European Open Science Cloud", + ModelConstants.DNET_SUBJECT_TYPOLOGIES,ModelConstants.DNET_SUBJECT_TYPOLOGIES) + ,OafMapperUtils.dataInfo(false, "propagation", true, false, + OafMapperUtils.qualifier("propagation:subject","Inferred by OpenAIRE", + ModelConstants.DNET_PROVENANCE_ACTIONS,ModelConstants.DNET_PROVENANCE_ACTIONS), "0.9")); public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils @@ -45,23 +58,63 @@ public class SparkEoscTag { final String inputPath = parser.get("sourcePath"); log.info("inputPath: {}", inputPath); + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + SparkConf conf = new SparkConf(); runWithSparkSession( conf, isSparkSessionManaged, spark -> { - execEoscTag(spark, inputPath); + execEoscTag(spark, inputPath, workingPath); }); } - private static void execEoscTag(SparkSession spark, String inputPath) { + private static void execEoscTag(SparkSession spark, String inputPath, String workingPath) { //search for notebook //subject contiene jupyter. //esistono python e notebook nei subject non necessariamente nello stesso - //si cerca fra i prodotto di tipo software - Dataset sw = readPath(spark, inputPath + "/software", Software.class) + //si cerca fra i prodotti di tipo software + + + readPath(spark, inputPath + "/software", Software.class) + .map((MapFunction) s -> { + if(containsSubjectNotebook(s)){ + s.getSubject().add(EOSC_NOTEBOOK); + } + return s; + }, Encoders.bean(Software.class) ) + .write() + .mode(SaveMode.Overwrite) + .option("compression","gzip") + .json(workingPath + "/software"); + + readPath(spark, workingPath + "/software" , Software.class) + .write() + .mode(SaveMode.Overwrite) + .option("compression","gzip") + .json(inputPath + "/software"); } + + private static boolean containsSubjectNotebook(Software s) { + if(s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("jupyter"))) + return true; + if(s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("python") && + sbj.getValue().toLowerCase().contains("notebook"))) + return true; + if(s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("python")) && + s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("notebook"))) + return true; + return false; + } + + private static boolean containsTitleNotebook(Software s) { + if (s.getTitle().stream().anyMatch(t -> t.getValue().toLowerCase().contains("jupyter") && + t.getValue().toLowerCase().contains("notebook"))) + return true; + return false; + } }