From 8a574fee2acec3e56c843c46c1681681472c6bdd Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 11 Oct 2022 11:41:19 +0200 Subject: [PATCH] [Dump] removing all EOSC related addition from master --- .../eu/dnetlib/dhp/ExecCreateSchemas.java | 3 - .../model/EoscInteroperabilityFramework.java | 67 ----- .../eu/dnetlib/dhp/eosc/model/EoscResult.java | 23 -- .../src/test/java/GenerateJsonSchema.java | 16 -- .../dhp/oa/graph/dump/ResultMapper.java | 33 +-- .../dump/eosc/SelectEoscRelationsStep2.java | 83 ------- .../dump/eosc/SelectEoscResultsJobStep1.java | 89 ------- .../dump/eosc_select_result_parameters.json | 30 --- .../eoscdump/oozie_app/config-default.xml | 30 --- .../dump/eoscdump/oozie_app/workflow.xml | 231 ------------------ .../dhp/oa/graph/dump/DumpJobTest.java | 43 +--- .../dhp/oa/graph/dump/eosc/eosctag.json | 1 - 12 files changed, 3 insertions(+), 646 deletions(-) delete mode 100644 dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/EoscInteroperabilityFramework.java delete mode 100644 dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/EoscResult.java delete mode 100644 dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscRelationsStep2.java delete mode 100644 dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultsJobStep1.java delete mode 100644 dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc_select_result_parameters.json delete mode 100644 dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eoscdump/oozie_app/config-default.xml delete mode 100644 dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eoscdump/oozie_app/workflow.xml delete mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/eosctag.json diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/ExecCreateSchemas.java b/dump-schema/src/main/java/eu/dnetlib/dhp/ExecCreateSchemas.java index 4adce65..a9a0c49 100644 --- a/dump-schema/src/main/java/eu/dnetlib/dhp/ExecCreateSchemas.java +++ b/dump-schema/src/main/java/eu/dnetlib/dhp/ExecCreateSchemas.java @@ -12,7 +12,6 @@ import com.fasterxml.jackson.databind.SerializationFeature; import com.github.imifou.jsonschema.module.addon.AddonModule; import com.github.victools.jsonschema.generator.*; -import eu.dnetlib.dhp.eosc.model.EoscResult; import eu.dnetlib.dhp.oa.model.community.CommunityResult; import eu.dnetlib.dhp.oa.model.graph.*; @@ -70,7 +69,5 @@ public class ExecCreateSchemas { ecs.generate(CommunityResult.class, DIRECTORY, "community_result_schema.json"); - ecs.generate(EoscResult.class, DIRECTORY, "eosc_result_schema.json"); - } } diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/EoscInteroperabilityFramework.java b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/EoscInteroperabilityFramework.java deleted file mode 100644 index 8b92bb2..0000000 --- a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/EoscInteroperabilityFramework.java +++ /dev/null @@ -1,67 +0,0 @@ - -package eu.dnetlib.dhp.eosc.model; - -import java.io.Serializable; - -import com.github.imifou.jsonschema.module.addon.annotation.JsonSchema; - -/** - * @author miriam.baglioni - * @Date 29/07/22 - */ -public class EoscInteroperabilityFramework implements Serializable { - @JsonSchema(description = "EOSC-IF label") - private String label; - - @JsonSchema( - description = "EOSC-IF local code. Later on it could be populated with a PID (e.g. DOI), but for the time being we stick to a more loose definition.") - private String code; - - @JsonSchema(description = "EOSC-IF url to the guidelines") - private String url; - - @JsonSchema(description = "EOSC-IF semantic relation (e.g. compliesWith)") - private String semanticRelation; - - public String getLabel() { - return label; - } - - public void setLabel(String label) { - this.label = label; - } - - public String getCode() { - return code; - } - - public void setCode(String code) { - this.code = code; - } - - public String getUrl() { - return url; - } - - public void setUrl(String url) { - this.url = url; - } - - public String getSemanticRelation() { - return semanticRelation; - } - - public void setSemanticRelation(String semanticRelation) { - this.semanticRelation = semanticRelation; - } - - public static EoscInteroperabilityFramework newInstance(String code, String label, String url, - String semanticRelation) { - EoscInteroperabilityFramework eif = new EoscInteroperabilityFramework(); - eif.label = label; - eif.code = code; - eif.url = url; - eif.semanticRelation = semanticRelation; - return eif; - } -} diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/EoscResult.java b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/EoscResult.java deleted file mode 100644 index 1bbc675..0000000 --- a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/EoscResult.java +++ /dev/null @@ -1,23 +0,0 @@ - -package eu.dnetlib.dhp.eosc.model; - -import com.github.imifou.jsonschema.module.addon.annotation.JsonSchema; - -import eu.dnetlib.dhp.oa.model.graph.GraphResult; - -/** - * @author miriam.baglioni - * @Date 29/07/22 - */ -public class EoscResult extends GraphResult { - @JsonSchema(description = "Describes a reference to the EOSC Interoperability Framework (IF) Guidelines") - private EoscInteroperabilityFramework eoscIF; - - public EoscInteroperabilityFramework getEoscIF() { - return eoscIF; - } - - public void setEoscIF(EoscInteroperabilityFramework eoscIF) { - this.eoscIF = eoscIF; - } -} diff --git a/dump-schema/src/test/java/GenerateJsonSchema.java b/dump-schema/src/test/java/GenerateJsonSchema.java index 7fe8076..6ac0e03 100644 --- a/dump-schema/src/test/java/GenerateJsonSchema.java +++ b/dump-schema/src/test/java/GenerateJsonSchema.java @@ -9,7 +9,6 @@ import com.github.imifou.jsonschema.module.addon.AddonModule; import com.github.victools.jsonschema.generator.*; import eu.dnetlib.dhp.ExecCreateSchemas; -import eu.dnetlib.dhp.eosc.model.EoscResult; import eu.dnetlib.dhp.oa.model.graph.GraphResult; //@Disabled @@ -46,22 +45,7 @@ class GenerateJsonSchema { System.out.println(jsonSchema.toString()); } - @Test - void generateSchema3() throws JsonProcessingException { - ObjectMapper objectMapper = new ObjectMapper(); - AddonModule module = new AddonModule(); - SchemaGeneratorConfigBuilder configBuilder = new SchemaGeneratorConfigBuilder(objectMapper, - SchemaVersion.DRAFT_7, OptionPreset.PLAIN_JSON) - .with(module) - .with(Option.SCHEMA_VERSION_INDICATOR) - .without(Option.NONPUBLIC_NONSTATIC_FIELDS_WITHOUT_GETTERS); - SchemaGeneratorConfig config = configBuilder.build(); - SchemaGenerator generator = new SchemaGenerator(config); - JsonNode jsonSchema = generator.generateSchema(EoscResult.class); - - System.out.println(new ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(jsonSchema)); - } @Test void generateJsonSchema3() throws IOException { diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java index 9d8d216..94d2c36 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java @@ -9,12 +9,6 @@ import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.dhp.eosc.model.EoscInteroperabilityFramework; -import eu.dnetlib.dhp.eosc.model.EoscResult; -import eu.dnetlib.dhp.oa.graph.dump.eosc.SelectEoscResultsJobStep1; import eu.dnetlib.dhp.oa.graph.dump.exceptions.CardinalityTooHighException; import eu.dnetlib.dhp.oa.graph.dump.exceptions.NoAvailableEntityTypeException; import eu.dnetlib.dhp.oa.model.*; @@ -44,8 +38,6 @@ public class ResultMapper implements Serializable { Result out; if (Constants.DUMPTYPE.COMPLETE.getType().equals(dumpType)) { out = new GraphResult(); - } else if (Constants.DUMPTYPE.EOSC.getType().equals(dumpType)) { - out = new EoscResult(); } else { out = new CommunityResult(); } @@ -158,10 +150,6 @@ public class ResultMapper implements Serializable { ((GraphResult) out) .setInstance( oInst.get().stream().map(ResultMapper::getGraphInstance).collect(Collectors.toList())); - } else if (Constants.DUMPTYPE.EOSC.getType().equals(dumpType)) { - ((EoscResult) out) - .setInstance( - oInst.get().stream().map(ResultMapper::getGraphInstance).collect(Collectors.toList())); } else { ((CommunityResult) out) .setInstance( @@ -260,26 +248,7 @@ public class ResultMapper implements Serializable { out.setIndicators(indicators); } - if (Constants.DUMPTYPE.EOSC.getType().equals(dumpType)) { - if (Optional.ofNullable(input.getEoscifguidelines()).isPresent()) { - List gei = input.getEoscifguidelines(); - if (gei.size() > 1) { - throw new CardinalityTooHighException( - "EOSC IF in the result has cardinality greater than one. Change dump!"); - } - if (gei.size() == 1) { - - EoscIfGuidelines ifra = gei.get(0); - ((EoscResult) out) - .setEoscIF( - EoscInteroperabilityFramework - .newInstance( - ifra.getCode(), ifra.getLabel(), ifra.getUrl(), - ifra.getSemanticRelation())); - - } - } - } else if (!Constants.DUMPTYPE.COMPLETE.getType().equals(dumpType)) { + if (!Constants.DUMPTYPE.COMPLETE.getType().equals(dumpType)) { ((CommunityResult) out) .setCollectedfrom( input diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscRelationsStep2.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscRelationsStep2.java deleted file mode 100644 index 72422f4..0000000 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscRelationsStep2.java +++ /dev/null @@ -1,83 +0,0 @@ - -package eu.dnetlib.dhp.oa.graph.dump.eosc; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - -import java.io.Serializable; -import java.util.Optional; - -import org.apache.commons.io.IOUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.FilterFunction; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.SparkSession; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.oa.graph.dump.Utils; -import eu.dnetlib.dhp.oa.model.graph.GraphResult; -import eu.dnetlib.dhp.schema.oaf.Relation; - -/** - * @author miriam.baglioni - * @Date 27/07/22 - */ -public class SelectEoscRelationsStep2 implements Serializable { - private static final Logger log = LoggerFactory.getLogger(SelectEoscRelationsStep2.class); - - public static void main(String[] args) throws Exception { - String jsonConfiguration = IOUtils - .toString( - SelectEoscRelationsStep2.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/dump/reletion_selection_parameters.json")); - - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); - - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - - final String inputPath = parser.get("sourcePath"); - log.info("inputPath: {}", inputPath); - - final String resultPath = parser.get("resultPath"); - log.info("resultPath: {}", resultPath); - - SparkConf conf = new SparkConf(); - - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - Utils.removeOutputDir(spark, resultPath + "/relation"); - selectRelations(spark, inputPath, resultPath + "/relation", resultPath); - }); - } - - private static void selectRelations(SparkSession spark, String inputPath, String outputPath, String resultPath) { - Dataset results = Utils - .readPath(spark, resultPath + "/publication", GraphResult.class) - .union( - Utils - .readPath(spark, resultPath + "/dataset", GraphResult.class)) - .union( - Utils - .readPath(spark, resultPath + "/software", GraphResult.class)) - .union( - Utils - .readPath(spark, resultPath + "/otherresearchproduct", GraphResult.class)); - - Dataset relations = Utils - .readPath(spark, inputPath + "/relation", Relation.class) - .filter( - (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && - !r.getDataInfo().getInvisible()); - - } - -} diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultsJobStep1.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultsJobStep1.java deleted file mode 100644 index 304d891..0000000 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultsJobStep1.java +++ /dev/null @@ -1,89 +0,0 @@ - -package eu.dnetlib.dhp.oa.graph.dump.eosc; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - -import java.io.Serializable; -import java.util.Optional; - -import org.apache.commons.io.IOUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.FilterFunction; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.eosc.model.EoscResult; -import eu.dnetlib.dhp.oa.graph.dump.Constants; -import eu.dnetlib.dhp.oa.graph.dump.ResultMapper; -import eu.dnetlib.dhp.oa.graph.dump.Utils; -import eu.dnetlib.dhp.oa.model.graph.GraphResult; -import eu.dnetlib.dhp.schema.oaf.Result; - -/** - * @author miriam.baglioni - * @Date 27/07/22 - */ -public class SelectEoscResultsJobStep1 implements Serializable { - private static final Logger log = LoggerFactory.getLogger(SelectEoscResultsJobStep1.class); - - public static void main(String[] args) throws Exception { - String jsonConfiguration = IOUtils - .toString( - SelectEoscResultsJobStep1.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/dump/eosc_select_result_parameters.json")); - - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); - - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - - final String inputPath = parser.get("sourcePath"); - log.info("inputPath: {}", inputPath); - - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); - - final String resultClassName = parser.get("resultTableName"); - log.info("resultTableName: {}", resultClassName); - - Class inputClazz = (Class) Class.forName(resultClassName); - - SparkConf conf = new SparkConf(); - - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - Utils.removeOutputDir(spark, outputPath); - selectEoscResults(spark, inputPath, outputPath, inputClazz); - }); - } - - private static void selectEoscResults(SparkSession spark, String inputPath, String outputPath, - Class inputClazz) { - Utils - .readPath(spark, inputPath, inputClazz) - .filter( - (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && !r.getDataInfo().getInvisible() - && r.getContext().stream().anyMatch(c -> c.getId().equals("eosc"))) - .map( - (MapFunction) r -> (EoscResult) ResultMapper - .map(r, null, Constants.DUMPTYPE.EOSC.getType()), - Encoders.bean(EoscResult.class)) - .write() - .mode(SaveMode.Overwrite) - .option("compression", "gzip") - .json(outputPath); - } - -} diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc_select_result_parameters.json b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc_select_result_parameters.json deleted file mode 100644 index a59a5ce..0000000 --- a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc_select_result_parameters.json +++ /dev/null @@ -1,30 +0,0 @@ -[ - - { - "paramName":"s", - "paramLongName":"sourcePath", - "paramDescription": "the path of the sequencial file to read", - "paramRequired": true - }, - { - "paramName": "out", - "paramLongName": "outputPath", - "paramDescription": "the path used to store temporary output files", - "paramRequired": true - }, - { - "paramName": "ssm", - "paramLongName": "isSparkSessionManaged", - "paramDescription": "true if the spark session is managed, false otherwise", - "paramRequired": false - }, - { - "paramName":"tn", - "paramLongName":"resultTableName", - "paramDescription": "the name of the result table we are currently working on", - "paramRequired": true - } -] - - - diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eoscdump/oozie_app/config-default.xml b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eoscdump/oozie_app/config-default.xml deleted file mode 100644 index d262cb6..0000000 --- a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eoscdump/oozie_app/config-default.xml +++ /dev/null @@ -1,30 +0,0 @@ - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - oozie.use.system.libpath - true - - - hiveMetastoreUris - thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 - - - hiveJdbcUrl - jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000 - - - hiveDbName - openaire - - - oozie.launcher.mapreduce.user.classpath.first - true - - diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eoscdump/oozie_app/workflow.xml b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eoscdump/oozie_app/workflow.xml deleted file mode 100644 index de85e94..0000000 --- a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eoscdump/oozie_app/workflow.xml +++ /dev/null @@ -1,231 +0,0 @@ - - - - sourcePath - the source path - - - outputPath - the output path - - - accessToken - the access token used for the deposition in Zenodo - - - connectionUrl - the connection url for Zenodo - - - metadata - the metadata associated to the deposition - - - depositionType - the type of deposition we want to perform. "new" for brand new deposition, "version" for a new version of a published deposition (in this case the concept record id must be provided), "upload" to upload content to an open deposition for which we already have the deposition id (in this case the deposition id should be provided) - - - conceptRecordId - for new version, the id of the record for the old deposition - - - depositionId - the depositionId of a deposition open that has to be added content - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - number of cores used by single executor - - - oozieActionShareLibForSpark2 - oozie action sharelib for spark 2.* - - - spark2ExtraListeners - com.cloudera.spark.lineage.NavigatorAppListener - spark 2.* extra listeners classname - - - spark2SqlQueryExecutionListeners - com.cloudera.spark.lineage.NavigatorQueryListener - spark 2.* sql query execution listeners classname - - - spark2YarnHistoryServerAddress - spark 2.* yarn history server address - - - spark2EventLogDir - spark 2.* event log dir location - - - - ${jobTracker} - ${nameNode} - - - mapreduce.job.queuename - ${queueName} - - - oozie.launcher.mapred.job.queue.name - ${oozieLauncherQueueName} - - - oozie.action.sharelib.for.spark - ${oozieActionShareLibForSpark2} - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - yarn - cluster - Dump Publication For EOSC - eu.dnetlib.dhp.oa.graph.dump.eosc.SelectEoscResultsJobStep1 - dump-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/publication - --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication - --outputPath${workingDir}/tar/publication - - - - - - - - yarn - cluster - Dump Dataset For EOSC - eu.dnetlib.dhp.oa.graph.dump.eosc.SelectEoscResultsJobStep1 - dump-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/dataset - --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset - --outputPath${workingDir}/tar/dataset - - - - - - - - yarn - cluster - Dump ORP For EOSC - eu.dnetlib.dhp.oa.graph.dump.eosc.SelectEoscResultsJobStep1 - dump-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/otherresearchproduct - --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - --outputPath${workingDir}/tar/otherresearchproduct - - - - - - - - yarn - cluster - Dump Software For EOSC - eu.dnetlib.dhp.oa.graph.dump.eosc.SelectEoscResultsJobStep1 - dump-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/software - --resultTableNameeu.dnetlib.dhp.schema.oaf.Software - --outputPath${workingDir}/tar/software - - - - - - - - - - - - eu.dnetlib.dhp.oa.graph.dump.MakeTar - --hdfsPath${outputPath} - --nameNode${nameNode} - --sourcePath${workingDir}/tar - - - - - - - - eu.dnetlib.dhp.oa.graph.dump.SendToZenodoHDFS - --hdfsPath${outputPath} - --nameNode${nameNode} - --accessToken${accessToken} - --connectionUrl${connectionUrl} - --metadata${metadata} - --conceptRecordId${conceptRecordId} - --depositionType${depositionType} - --depositionId${depositionId} - - - - - - \ No newline at end of file diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/DumpJobTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/DumpJobTest.java index 11a65d0..a45e317 100644 --- a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/DumpJobTest.java +++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/DumpJobTest.java @@ -25,9 +25,9 @@ import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.gson.Gson; -import eu.dnetlib.dhp.eosc.model.EoscResult; + import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap; -import eu.dnetlib.dhp.oa.graph.dump.eosc.SelectEoscResultsJobStep1; + import eu.dnetlib.dhp.oa.model.Instance; import eu.dnetlib.dhp.oa.model.OpenAccessRoute; import eu.dnetlib.dhp.oa.model.community.CommunityResult; @@ -1096,46 +1096,7 @@ public class DumpJobTest { } - @Test - public void testEOSCDump() throws Exception { - final String sourcePath = getClass() - .getResource("/eu/dnetlib/dhp/oa/graph/dump/eosc/eosctag.json") - .getPath(); - final String communityMapPath = getClass() - .getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json") - .getPath(); - - SelectEoscResultsJobStep1 - .main( - new String[] { - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-sourcePath", - sourcePath, - "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset", - "-outputPath", workingDir.toString() + "/working" - - }); - - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - - JavaRDD tmp = sc - .textFile(workingDir.toString() + "/working") - .map(item -> OBJECT_MAPPER.readValue(item, EoscResult.class)); - - org.apache.spark.sql.Dataset verificationDataset = spark - .createDataset(tmp.rdd(), Encoders.bean(EoscResult.class)); - - Assertions.assertEquals(1, verificationDataset.count()); - - Assertions.assertEquals(1, verificationDataset.filter("type = 'dataset'").count()); - - Assertions.assertEquals(1, tmp.filter(d -> d.getEoscIF().getCode().equals("EOSC::Twitter Data")).count()); - Assertions.assertEquals(1, tmp.filter(d -> d.getEoscIF().getLabel().equals("EOSC::Twitter Data")).count()); - Assertions.assertEquals(1, tmp.filter(d -> d.getEoscIF().getUrl().equals("")).count()); - Assertions.assertEquals(1, tmp.filter(d -> d.getEoscIF().getSemanticRelation().equals("compliesWith")).count()); - - } @Test public void testArticlePCA() { diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/eosctag.json b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/eosctag.json deleted file mode 100644 index cd17fa7..0000000 --- a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/eosctag.json +++ /dev/null @@ -1 +0,0 @@ -{"geolocation": [], "dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "resourcetype": {"classid": "UNKNOWN", "classname": "Unknown", "schemeid": "dnet:dataCite_resource", "schemename": "dnet:dataCite_resource"}, "pid": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "10.21227/mqmt-yq28"}], "contributor": [], "bestaccessright": {"classid": "UNKNOWN", "classname": "not available", "schemeid": "dnet:access_modes", "schemename": "dnet:access_modes"}, "relevantdate": [{"qualifier": {"classid": "issued", "classname": "issued", "schemeid": "dnet:dataCite_date", "schemename": "dnet:dataCite_date"}, "value": "2020-11-21"}], "collectedfrom": [{"key": "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254", "value": "Datacite"}], "id": "50|doi_________::bbf3a8925017a575215fc7be77cab114", "subject": [{"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "COVID-19"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "Machine Learning"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "Corona Tweets Dataset"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "COVID-19 Tweets Dataset"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "Corona Tweets"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "COVID-19 Tweets"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "Corona Twitter Sentiment"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "COVID-19 Twitter Sentiment"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "SARS-CoV-2 Tweets Dataset"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "SARS-CoV-2 Twitter Sentiment"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "Coronavirus English Tweets Dataset"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "COVID-19 English Tweets Dataset"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "Coronavirus Geotagged Tweets"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "COVID-19 Geotagged Tweets"}, {"dataInfo": {"provenanceaction": {"classid": "iis", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "iis::document_classes", "invisible": false, "trust": "0.891"}, "qualifier": {"classid": "ACM", "classname": "ACM Computing Classification System", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "InformationSystems_MISCELLANEOUS"}, {"dataInfo": {"provenanceaction": {"classid": "iis", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "iis::document_classes", "invisible": false, "trust": "0.8226"}, "qualifier": {"classid": "ACM", "classname": "ACM Computing Classification System", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "InformationSystems_INFORMATIONSTORAGEANDRETRIEVAL"}], "lastupdatetimestamp": 1657046634922, "author": [{"surname": "Lamsal", "name": "Rabindra", "pid": [], "rank": 1, "affiliation": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "value": "School of Computer and Systems Sciences, JN"}], "fullname": "Lamsal, Rabindra"}], "instance": [{"refereed": {"classid": "0000", "classname": "UNKNOWN", "schemeid": "dnet:review_levels", "schemename": "dnet:review_levels"}, "hostedby": {"key": "10|re3data_____::3bc31eb6c47d0134a1ac576dc028c3b9", "value": "IEEE DataPort"}, "license": {"value": "https://creativecommons.org/licenses/by/4.0/legalcode"}, "url": ["https://dx.doi.org/10.21227/mqmt-yq28"], "pid": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "10.21227/mqmt-yq28"}], "dateofacceptance": {"value": "2020-11-21"}, "collectedfrom": {"key": "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254", "value": "Datacite"}, "accessright": {"classid": "UNKNOWN", "classname": "not available", "schemeid": "dnet:access_modes", "schemename": "dnet:access_modes"}, "instancetype": {"classid": "0021", "classname": "Dataset", "schemeid": "dnet:publication_resource", "schemename": "dnet:publication_resource"}}], "dateofcollection": "2020-11-21T04:46:12+0000", "fulltext": [], "dateoftransformation": "2020-11-21T04:46:12+0000", "description": [{"value": "This dataset contains IDs and sentiment scores of the geo-tagged tweets related to the COVID-19 pandemic. The tweets are captured by an on-going project deployed at https://live.rlamsal.com.np. The model monitors the real-time Twitter feed for coronavirus-related tweets using 90+ different keywords and hashtags that are commonly used while referencing the pandemic. Complying with Twitter's content redistribution policy, only the tweet IDs are shared. You can re-construct the dataset by hydrating these IDs. The tweet IDs in this dataset belong to the tweets tweeted providing an exact location.The paper associated with this dataset is available here: Design and analysis of a large-scale COVID-19 tweets dataset"}], "format": [], "measures": [], "coverage": [], "externalReference": [], "publisher": {"value": "IEEE DataPort"}, "context": [{"dataInfo": [{"provenanceaction": {"classid": "iis", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "iis::document_covid19", "invisible": false, "trust": "0.9"}, {"provenanceaction": {"classid": "community:subject", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "bulktagging", "invisible": false, "trust": "0.8"}, {"provenanceaction": {"classid": "community:datasource", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "bulktagging", "invisible": false, "trust": "0.8"}], "id": "covid-19"}, {"dataInfo": [{"provenanceaction": {"classid": "community:datasource", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "bulktagging", "invisible": false, "trust": "0.8"}], "id": "eosc"}], "eoscifguidelines": [{"semanticRelation": "compliesWith", "url": "", "code": "EOSC::Twitter Data", "label": "EOSC::Twitter Data"}], "language": {"classid": "und", "classname": "Undetermined", "schemeid": "dnet:languages", "schemename": "dnet:languages"}, "resulttype": {"classid": "dataset", "classname": "dataset", "schemeid": "dnet:result_typologies", "schemename": "dnet:result_typologies"}, "country": [], "extraInfo": [], "originalId": ["10.21227/mqmt-yq28"], "source": [], "dateofacceptance": {"value": "2020-11-21"}, "title": [{"qualifier": {"classid": "main title", "classname": "main title", "schemeid": "dnet:dataCite_title", "schemename": "dnet:dataCite_title"}, "value": "Coronavirus (COVID-19) Geo-tagged Tweets Dataset"}]} \ No newline at end of file