From 58bc3f223ad1e180113fae6909c8656a8b85ee68 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 2 Dec 2021 14:09:46 +0100 Subject: [PATCH] [GRAPH DUMP] Add filtering for relation we do not want to dump. It is based on the relclass --- .../dump/complete/CreateContextEntities.java | 2 +- .../dump/complete/CreateContextRelation.java | 2 +- .../dump/complete/SparkCollectAndSave.java | 2 +- .../dump/complete/SparkDumpEntitiesJob.java | 2 +- .../dump/complete/SparkDumpRelationJob.java | 17 +- .../complete/SparkOrganizationRelation.java | 2 +- .../SparkSelectValidRelationsJob.java | 2 +- .../community/oozie_app/config-default.xml | 30 - .../dump/community/oozie_app/workflow.xml | 431 ------------- .../complete/oozie_app/config-default.xml | 30 - .../dump/complete/oozie_app/workflow.xml | 586 ------------------ .../community_infrastructure_schema.json | 37 -- .../complete/schema/datasource_schema.json | 192 ------ .../complete/schema/organization_schema.json | 57 -- .../dump/complete/schema/project_schema.json | 119 ---- .../dump/complete/schema/relation_schema.json | 60 -- .../dump/complete/schema/result_schema.json | 398 ------------ .../oozie_app/config-default.xml | 30 - .../dump/funderresults/oozie_app/workflow.xml | 563 ----------------- .../input_collect_and_save.json | 0 ...rs.json => input_complete_parameters.json} | 0 .../input_entity_parameter.json | 0 .../input_organization_parameters.json | 0 .../input_relationdump_parameters.json | 6 + .../community_infrastructure_schema.json | 35 -- .../graph/dump/schemas/datasource_schema.json | 192 ------ .../dump/schemas/organization_schema.json | 57 -- .../oa/graph/dump/schemas/project_schema.json | 119 ---- .../graph/dump/schemas/relation_schema.json | 68 -- .../oa/graph/dump/schemas/result_schema.json | 417 ------------- .../graph/dump/complete/DumpRelationTest.java | 61 +- 31 files changed, 84 insertions(+), 3433 deletions(-) delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/community/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/community/oozie_app/workflow.xml delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/oozie_app/workflow.xml delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/community_infrastructure_schema.json delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/datasource_schema.json delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/organization_schema.json delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/project_schema.json delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/relation_schema.json delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/result_schema.json delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/funderresults/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/funderresults/oozie_app/workflow.xml rename dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/{complete => }/input_collect_and_save.json (100%) rename dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/{complete/input_parameters.json => input_complete_parameters.json} (100%) rename dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/{complete => }/input_entity_parameter.json (100%) rename dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/{complete => }/input_organization_parameters.json (100%) rename dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/{complete => }/input_relationdump_parameters.json (73%) delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/community_infrastructure_schema.json delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/datasource_schema.json delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/organization_schema.json delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/project_schema.json delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/relation_schema.json delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/result_schema.json diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/CreateContextEntities.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/CreateContextEntities.java index 120de9327..aa031203d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/CreateContextEntities.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/CreateContextEntities.java @@ -41,7 +41,7 @@ public class CreateContextEntities implements Serializable { .toString( CreateContextEntities.class .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/dump/complete/input_entity_parameter.json")); + "/eu/dnetlib/dhp/oa/graph/dump/input_entity_parameter.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/CreateContextRelation.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/CreateContextRelation.java index a468e334d..3c71fcaa5 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/CreateContextRelation.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/CreateContextRelation.java @@ -48,7 +48,7 @@ public class CreateContextRelation implements Serializable { .requireNonNull( CreateContextRelation.class .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/dump/complete/input_entity_parameter.json"))); + "/eu/dnetlib/dhp/oa/graph/dump/input_entity_parameter.json"))); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkCollectAndSave.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkCollectAndSave.java index 671bccd25..e902a02c8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkCollectAndSave.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkCollectAndSave.java @@ -31,7 +31,7 @@ public class SparkCollectAndSave implements Serializable { .toString( SparkCollectAndSave.class .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/dump/complete/input_collect_and_save.json")); + "/eu/dnetlib/dhp/oa/graph/dump/input_collect_and_save.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkDumpEntitiesJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkDumpEntitiesJob.java index 8b282386f..f239ffca7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkDumpEntitiesJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkDumpEntitiesJob.java @@ -22,7 +22,7 @@ public class SparkDumpEntitiesJob implements Serializable { .toString( SparkDumpEntitiesJob.class .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/dump/complete/input_parameters.json")); + "/eu/dnetlib/dhp/oa/graph/dump/wf/input_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkDumpRelationJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkDumpRelationJob.java index ddfd6592f..05e3f8835 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkDumpRelationJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkDumpRelationJob.java @@ -4,10 +4,11 @@ package eu.dnetlib.dhp.oa.graph.dump.complete; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.Serializable; -import java.util.Optional; +import java.util.*; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -37,7 +38,7 @@ public class SparkDumpRelationJob implements Serializable { .toString( SparkDumpRelationJob.class .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/dump/complete/input_relationdump_parameters.json")); + "/eu/dnetlib/dhp/oa/graph/dump/input_relationdump_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args); @@ -54,6 +55,13 @@ public class SparkDumpRelationJob implements Serializable { final String outputPath = parser.get("outputPath"); log.info("outputPath: {}", outputPath); + Optional rs = Optional.ofNullable(parser.get("removeSet")); + final Set removeSet = new HashSet<>(); + if(rs.isPresent()){ + Collections.addAll(removeSet, rs.get().split(";")); + } + + SparkConf conf = new SparkConf(); runWithSparkSession( @@ -61,15 +69,16 @@ public class SparkDumpRelationJob implements Serializable { isSparkSessionManaged, spark -> { Utils.removeOutputDir(spark, outputPath); - dumpRelation(spark, inputPath, outputPath); + dumpRelation(spark, inputPath, outputPath, removeSet); }); } - private static void dumpRelation(SparkSession spark, String inputPath, String outputPath) { + private static void dumpRelation(SparkSession spark, String inputPath, String outputPath, Set removeSet) { Dataset relations = Utils.readPath(spark, inputPath, Relation.class); relations + .filter((FilterFunction)r -> !removeSet.contains(r.getRelClass())) .map((MapFunction) relation -> { eu.dnetlib.dhp.schema.dump.oaf.graph.Relation relNew = new eu.dnetlib.dhp.schema.dump.oaf.graph.Relation(); relNew diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkOrganizationRelation.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkOrganizationRelation.java index f9d2123e2..4475232de 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkOrganizationRelation.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkOrganizationRelation.java @@ -39,7 +39,7 @@ public class SparkOrganizationRelation implements Serializable { .toString( SparkOrganizationRelation.class .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/dump/complete/input_organization_parameters.json")); + "/eu/dnetlib/dhp/oa/graph/dump/input_organization_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkSelectValidRelationsJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkSelectValidRelationsJob.java index 20f3fc4a7..d1bca6a7c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkSelectValidRelationsJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/complete/SparkSelectValidRelationsJob.java @@ -35,7 +35,7 @@ public class SparkSelectValidRelationsJob implements Serializable { .toString( SparkSelectValidRelationsJob.class .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/dump/complete/input_relationdump_parameters.json")); + "/eu/dnetlib/dhp/oa/graph/dump/input_relationdump_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/community/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/community/oozie_app/config-default.xml deleted file mode 100644 index e5ec3d0ae..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/community/oozie_app/config-default.xml +++ /dev/null @@ -1,30 +0,0 @@ - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - oozie.use.system.libpath - true - - - hiveMetastoreUris - thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 - - - hiveJdbcUrl - jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000 - - - hiveDbName - openaire - - - oozie.launcher.mapreduce.user.classpath.first - true - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/community/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/community/oozie_app/workflow.xml deleted file mode 100644 index fcef2547a..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/community/oozie_app/workflow.xml +++ /dev/null @@ -1,431 +0,0 @@ - - - - - sourcePath - the source path - - - isLookUpUrl - the isLookup service endpoint - - - outputPath - the output path - - - accessToken - the access token used for the deposition in Zenodo - - - connectionUrl - the connection url for Zenodo - - - metadata - the metadata associated to the deposition - - - depositionType - one among {new, update, version} - - - conceptRecordId - for new version, the id of the record for the old deposition - - - hiveDbName - the target hive database name - - - hiveJdbcUrl - hive server jdbc url - - - hiveMetastoreUris - hive server metastore URIs - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - number of cores used by single executor - - - oozieActionShareLibForSpark2 - oozie action sharelib for spark 2.* - - - spark2ExtraListeners - com.cloudera.spark.lineage.NavigatorAppListener - spark 2.* extra listeners classname - - - spark2SqlQueryExecutionListeners - com.cloudera.spark.lineage.NavigatorQueryListener - spark 2.* sql query execution listeners classname - - - spark2YarnHistoryServerAddress - spark 2.* yarn history server address - - - spark2EventLogDir - spark 2.* event log dir location - - - - - ${jobTracker} - ${nameNode} - - - mapreduce.job.queuename - ${queueName} - - - oozie.launcher.mapred.job.queue.name - ${oozieLauncherQueueName} - - - oozie.action.sharelib.for.spark - ${oozieActionShareLibForSpark2} - - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - eu.dnetlib.dhp.oa.graph.dump.SaveCommunityMap - --outputPath${workingDir}/communityMap - --nameNode${nameNode} - --isLookUpUrl${isLookUpUrl} - - - - - - - - - - - - - - - yarn - cluster - Dump table publication for community related products - eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/publication - --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication - --outputPath${workingDir}/publication - --communityMapPath${workingDir}/communityMap - - - - - - - - yarn - cluster - Dump table dataset for community related products - eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/dataset - --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset - --outputPath${workingDir}/dataset - --communityMapPath${workingDir}/communityMap - - - - - - - - yarn - cluster - Dump table ORP for community related products - eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/otherresearchproduct - --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - --outputPath${workingDir}/otherresearchproduct - --communityMapPath${workingDir}/communityMap - - - - - - - - yarn - cluster - Dump table software for community related products - eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/software - --resultTableNameeu.dnetlib.dhp.schema.oaf.Software - --outputPath${workingDir}/software - --communityMapPath${workingDir}/communityMap - - - - - - - - - - yarn - cluster - Prepare association result subset of project info - eu.dnetlib.dhp.oa.graph.dump.community.SparkPrepareResultProject - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath} - --outputPath${workingDir}/preparedInfo - - - - - - - - - - - - - - - yarn - cluster - Extend dumped publications with information about project - eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${workingDir}/publication - --outputPath${workingDir}/ext/publication - --preparedInfoPath${workingDir}/preparedInfo - - - - - - - - yarn - cluster - Extend dumped dataset with information about project - eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${workingDir}/dataset - --outputPath${workingDir}/ext/dataset - --preparedInfoPath${workingDir}/preparedInfo - - - - - - - - yarn - cluster - Extend dumped ORP with information about project - eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${workingDir}/otherresearchproduct - --outputPath${workingDir}/ext/orp - --preparedInfoPath${workingDir}/preparedInfo - - - - - - - - yarn - cluster - Extend dumped software with information about project - eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${workingDir}/software - --outputPath${workingDir}/ext/software - --preparedInfoPath${workingDir}/preparedInfo - - - - - - - - - - yarn - cluster - Split dumped result for community - eu.dnetlib.dhp.oa.graph.dump.community.SparkSplitForCommunity - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${workingDir}/ext - --outputPath${workingDir}/split - --communityMapPath${workingDir}/communityMap - - - - - - - - eu.dnetlib.dhp.oa.graph.dump.MakeTar - --hdfsPath${outputPath} - --nameNode${nameNode} - --sourcePath${workingDir}/split - - - - - - - - eu.dnetlib.dhp.oa.graph.dump.SendToZenodoHDFS - --hdfsPath${outputPath} - --nameNode${nameNode} - --accessToken${accessToken} - --connectionUrl${connectionUrl} - --metadata${metadata} - --communityMapPath${workingDir}/communityMap - --conceptRecordId${conceptRecordId} - --depositionId${depositionId} - --depositionType${depositionType} - - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/oozie_app/config-default.xml deleted file mode 100644 index e5ec3d0ae..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/oozie_app/config-default.xml +++ /dev/null @@ -1,30 +0,0 @@ - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - oozie.use.system.libpath - true - - - hiveMetastoreUris - thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 - - - hiveJdbcUrl - jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000 - - - hiveDbName - openaire - - - oozie.launcher.mapreduce.user.classpath.first - true - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/oozie_app/workflow.xml deleted file mode 100644 index 8189e2594..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/oozie_app/workflow.xml +++ /dev/null @@ -1,586 +0,0 @@ - - - - - sourcePath - the source path - - - isLookUpUrl - the isLookup service endpoint - - - outputPath - the output path - - - resultAggregation - true if all the result type have to be dumped under result. false otherwise - - - accessToken - the access token used for the deposition in Zenodo - - - connectionUrl - the connection url for Zenodo - - - metadata - the metadata associated to the deposition - - - depositionType - the type of deposition we want to perform. "new" for brand new deposition, "version" for a new version of a published deposition (in this case the concept record id must be provided), "upload" to upload content to an open deposition for which we already have the deposition id (in this case the deposition id should be provided) - - - conceptRecordId - for new version, the id of the record for the old deposition - - - depositionId - the depositionId of a deposition open that has to be added content - - - organizationCommunityMap - the organization community map - - - - hiveDbName - the target hive database name - - - hiveJdbcUrl - hive server jdbc url - - - hiveMetastoreUris - hive server metastore URIs - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - number of cores used by single executor - - - oozieActionShareLibForSpark2 - oozie action sharelib for spark 2.* - - - spark2ExtraListeners - com.cloudera.spark.lineage.NavigatorAppListener - spark 2.* extra listeners classname - - - spark2SqlQueryExecutionListeners - com.cloudera.spark.lineage.NavigatorQueryListener - spark 2.* sql query execution listeners classname - - - spark2YarnHistoryServerAddress - spark 2.* yarn history server address - - - spark2EventLogDir - spark 2.* event log dir location - - - - - ${jobTracker} - ${nameNode} - - - mapreduce.job.queuename - ${queueName} - - - oozie.launcher.mapred.job.queue.name - ${oozieLauncherQueueName} - - - oozie.action.sharelib.for.spark - ${oozieActionShareLibForSpark2} - - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - eu.dnetlib.dhp.oa.graph.dump.SaveCommunityMap - --outputPath${workingDir}/communityMap - --nameNode${nameNode} - --isLookUpUrl${isLookUpUrl} - - - - - - - - - - - - - - - - - - - yarn - cluster - Dump table publication - eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/publication - --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication - --outputPath${workingDir}/result/publication - --communityMapPath${workingDir}/communityMap - - - - - - - - yarn - cluster - Dump table dataset - eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/dataset - --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset - --outputPath${workingDir}/result/dataset - --communityMapPath${workingDir}/communityMap - - - - - - - - yarn - cluster - Dump table ORP - eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/otherresearchproduct - --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - --outputPath${workingDir}/result/otherresearchproduct - --communityMapPath${workingDir}/communityMap - - - - - - - - yarn - cluster - Dump table software - eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/software - --resultTableNameeu.dnetlib.dhp.schema.oaf.Software - --outputPath${workingDir}/result/software - --communityMapPath${workingDir}/communityMap - - - - - - - - yarn - cluster - Dump table organization - eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/organization - --resultTableNameeu.dnetlib.dhp.schema.oaf.Organization - --outputPath${workingDir}/collect/organization - --communityMapPath${workingDir}/communityMap - - - - - - - - yarn - cluster - Dump table project - eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/project - --resultTableNameeu.dnetlib.dhp.schema.oaf.Project - --outputPath${workingDir}/collect/project - --communityMapPath${workingDir}/communityMap - - - - - - - - yarn - cluster - Dump table datasource - eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/datasource - --resultTableNameeu.dnetlib.dhp.schema.oaf.Datasource - --outputPath${workingDir}/collect/datasource - --communityMapPath${workingDir}/communityMap - - - - - - - - yarn - cluster - Dump table relation - eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpRelationJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/relation - --outputPath${workingDir}/relation/relation - - - - - - - - - - - - - - - - eu.dnetlib.dhp.oa.graph.dump.complete.CreateContextEntities - --hdfsPath${workingDir}/collect/communities_infrastructures/communities_infrastructure.json.gz - --nameNode${nameNode} - --isLookUpUrl${isLookUpUrl} - - - - - - - - eu.dnetlib.dhp.oa.graph.dump.complete.CreateContextRelation - --hdfsPath${workingDir}/relation/context - --nameNode${nameNode} - --isLookUpUrl${isLookUpUrl} - - - - - - - - yarn - cluster - Dump table relation - eu.dnetlib.dhp.oa.graph.dump.complete.SparkOrganizationRelation - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/relation - --outputPath${workingDir}/relation/contextOrg - --organizationCommunityMap${organizationCommunityMap} - --communityMapPath${workingDir}/communityMap - - - - - - - - - - - - - - - - - yarn - cluster - Extract Relations from publication - eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/publication - --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication - --outputPath${workingDir}/relation/publication - --communityMapPath${workingDir}/communityMap - - - - - - - - yarn - cluster - Dump table dataset - eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/dataset - --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset - --outputPath${workingDir}/relation/dataset - --communityMapPath${workingDir}/communityMap - - - - - - - - yarn - cluster - Dump table ORP - eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/otherresearchproduct - --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - --outputPath${workingDir}/relation/orp - --communityMapPath${workingDir}/communityMap - - - - - - - - yarn - cluster - Dump table software - eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/software - --resultTableNameeu.dnetlib.dhp.schema.oaf.Software - --outputPath${workingDir}/relation/software - --communityMapPath${workingDir}/communityMap - - - - - - - - - - yarn - cluster - Collect Results and Relations and put them in the right path - eu.dnetlib.dhp.oa.graph.dump.complete.SparkCollectAndSave - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${workingDir} - --outputPath${workingDir}/collect - --resultAggregation${resultAggregation} - - - - - - - - eu.dnetlib.dhp.oa.graph.dump.MakeTar - --hdfsPath${outputPath} - --nameNode${nameNode} - --sourcePath${workingDir}/collect - - - - - - - - eu.dnetlib.dhp.oa.graph.dump.SendToZenodoHDFS - --hdfsPath${outputPath} - --nameNode${nameNode} - --accessToken${accessToken} - --connectionUrl${connectionUrl} - --metadata${metadata} - --communityMapPath${workingDir}/communityMap - --conceptRecordId${conceptRecordId} - --depositionType${depositionType} - --depositionId${depositionId} - - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/community_infrastructure_schema.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/community_infrastructure_schema.json deleted file mode 100644 index d2f179212..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/community_infrastructure_schema.json +++ /dev/null @@ -1,37 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "properties": { - "description": { - "type": "string", - "description": "Description of the research community/research infrastructure" - }, - "id": { - "type": "string", - "description": "OpenAIRE id of the research community/research infrastructure" - }, - "name": { - "type": "string", - "description": "The long name of the community" - }, - "originalId": { - "type": "string", - "description": "The acronym of the community" - }, - "subject": { - "description": "Only for research communities: the list of the subjects associated to the research community", - "type": "array", - "items": { - "type": "string" - } - }, - "type": { - "type": "string", - "description": "One of {Research Community, Research infrastructure}" - }, - "zenodo_community": { - "type": "string", - "description": "The URL of the Zenodo community associated to the Research community/Research infrastructure" - } - } -} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/datasource_schema.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/datasource_schema.json deleted file mode 100644 index b9c15d921..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/datasource_schema.json +++ /dev/null @@ -1,192 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "definitions": { - "ControlledField": { - "type": "object", - "properties": { - "scheme": { - "type": "string" - }, - "value": { - "type": "string" - } - }, - "description": "To represent the information described by a scheme and a value in that scheme (i.e. pid)" - } - }, - "type": "object", - "properties": { - "accessrights": { - "type": "string", - "description": "Type of access to the data source, as defined by re3data.org. Possible values: {open, restricted, closed}" - }, - "certificates": { - "type": "string", - "description": "The certificate, seal or standard the data source complies with. As defined by re3data.org." - }, - "citationguidelineurl": { - "type": "string", - "description":"The URL of the data source providing information on how to cite its items. As defined by re3data.org." - }, - "contenttypes": { - "description": "Types of content in the data source, as defined by OpenDOAR", - "type": "array", - "items": { - "type": "string" - } - }, - "databaseaccessrestriction": { - "type": "string", - "description": "Access restrinctions to the data source, as defined by re3data.org. One of {feeRequired, registration, other}" - }, - "datasourcetype": { - "allOf": [ - { - "$ref": "#/definitions/ControlledField" - }, - { - "description": "The type of the datasource. See https://api.openaire.eu/vocabularies/dnet:datasource_typologies" - } - ] - }, - "datauploadrestriction": { - "type": "string", - "description": "Upload restrictions applied by the datasource, as defined by re3data.org. One of {feeRequired, registration, other}" - }, - "dateofvalidation": { - "type": "string", - "description": "The date of last validation against the OpenAIRE guidelines for the datasource records" - }, - "description": { - "type": "string" - }, - "englishname": { - "type": "string", - "description": "The English name of the datasource" - }, - "id": { - "type": "string", - "description": "The OpenAIRE id of the data source" - }, - "journal": { - "type": "object", - "properties": { - "conferencedate": { - "type": "string" - }, - "conferenceplace": { - "type": "string" - }, - "edition": { - "type": "string" - }, - "ep": { - "type": "string", - "description": "End page" - }, - "iss": { - "type": "string", - "description": "Issue number" - }, - "issnLinking": { - "type": "string" - }, - "issnOnline": { - "type": "string" - }, - "issnPrinted": { - "type": "string" - }, - "name": { - "type": "string" - }, - "sp": { - "type": "string", - "description": "Start page" - }, - "vol": { - "type": "string", - "description": "Volume" - } - }, - "description": "Information about the journal, if this data source is of type Journal." - }, - "languages": { - "description": "The languages present in the data source's content, as defined by OpenDOAR.", - "type": "array", - "items": { - "type": "string" - } - }, - "logourl": { - "type": "string" - }, - "missionstatementurl": { - "type": "string", - "description":"The URL of a mission statement describing the designated community of the data source. As defined by re3data.org" - }, - "officialname": { - "type": "string", - "description": "The official name of the datasource" - }, - "openairecompatibility": { - "type": "string", - "description": "OpenAIRE guidelines the data source comply with. See also https://guidelines.openaire.eu." - }, - "originalId": { - "description": "Original identifiers for the datasource" - "type": "array", - "items": { - "type": "string" - } - }, - "pid": { - "description": "Persistent identifiers of the datasource", - "type": "array", - "items": { - "allOf": [ - { - "$ref": "#/definitions/ControlledField" - } - ] - } - }, - "pidsystems": { - "type": "string", - "description": "The persistent identifier system that is used by the data source. As defined by re3data.org" - }, - "policies": { - "description": "Policies of the data source, as defined in OpenDOAR.", - "type": "array", - "items": { - "type": "string" - } - }, - "releaseenddate": { - "type": "string", - "description": "Date when the data source went offline or stopped ingesting new research data. As defined by re3data.org" - }, - "releasestartdate": { - "type": "string", - "description": "Releasing date of the data source, as defined by re3data.org" - }, - "subjects": { - "description": "List of subjects associated to the datasource", - "type": "array", - "items": { - "type": "string" - } - }, - "uploadrights": { - "type": "string", - "description": "Type of data upload. As defined by re3data.org: one of {open, restricted,closed}" - }, - "versioning": { - "type": "boolean", - "description": "As defined by redata.org: 'yes' if the data source supports versioning, 'no' otherwise." - }, - "websiteurl": { - "type": "string" - } - } -} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/organization_schema.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/organization_schema.json deleted file mode 100644 index 16afa386d..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/organization_schema.json +++ /dev/null @@ -1,57 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "properties": { - "alternativenames": { - "description": "Alternative names that identify the organisation", - "type": "array", - "items": { - "type": "string" - } - }, - "country": { - "type": "object", - "properties": { - "code": { - "type": "string", - "description": "The organisation country code" - }, - "label": { - "type": "string", - "description": "The organisation country label" - } - }, - "description": "The country of the organisation" - }, - "id": { - "type": "string", - "description": "The OpenAIRE id for the organisation" - }, - "legalname": { - "type": "string" - }, - "legalshortname": { - "type": "string" - }, - "pid": { - "description": "Persistent identifiers for the organisation i.e. isni 0000000090326370", - "type": "array", - "items": { - "type": "object", - "properties": { - "scheme": { - "type": "string", - "description": "The scheme of the identifier (i.e. isni)" - }, - "value": { - "type": "string", - "description": "the value in the schema (i.e. 0000000090326370)" - } - } - } - }, - "websiteurl": { - "type": "string" - } - } -} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/project_schema.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/project_schema.json deleted file mode 100644 index c81187258..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/project_schema.json +++ /dev/null @@ -1,119 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "properties": { - "acronym": { - "type": "string" - }, - "callidentifier": { - "type": "string" - }, - "code": { - "type": "string", - "description": "The grant agreement number" - }, - "enddate": { - "type": "string" - }, - "funding": { - "description": "Funding information for the project", - "type": "array", - "items": { - "type": "object", - "properties": { - "funding_stream": { - "type": "object", - "properties": { - "description": { - "type": "string", - "description": "Description of the funding stream" - }, - "id": { - "type": "string", - "description": "Id of the funding stream" - } - } - }, - "jurisdiction": { - "type": "string", - "description": "The jurisdiction of the funder (i.e. EU)" - }, - "name": { - "type": "string", - "description": "The name of the funder (European Commission)" - }, - "shortName": { - "type": "string", - "description": "The short name of the funder (EC)" - } - } - } - }, - "granted": { - "type": "object", - "properties": { - "currency": { - "type": "string", - "description": "The currency of the granted amount (e.g. EUR)" - }, - "fundedamount": { - "type": "number", - "description": "The funded amount" - }, - "totalcost": { - "type": "number", - "description": "The total cost of the project" - } - }, - "description": "The money granted to the project" - }, - "h2020programme": { - "description": "The h2020 programme funding the project", - "type": "array", - "items": { - "type": "object", - "properties": { - "code": { - "type": "string", - "description": "The code of the programme" - }, - "description": { - "type": "string", - "description": "The description of the programme" - } - } - } - }, - "id": { - "type": "string", - "description": "OpenAIRE id for the project" - }, - "keywords": { - "type": "string" - }, - "openaccessmandatefordataset": { - "type": "boolean" - }, - "openaccessmandateforpublications": { - "type": "boolean" - }, - "startdate": { - "type": "string" - }, - "subject": { - "type": "array", - "items": { - "type": "string" - } - }, - "summary": { - "type": "string" - }, - "title": { - "type": "string" - }, - "websiteurl": { - "type": "string" - } - } -} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/relation_schema.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/relation_schema.json deleted file mode 100644 index 7c7de9c98..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/relation_schema.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "definitions": { - "Node": { - "type": "object", - "properties": { - "id": { - "type": "string", - "description": "The OpenAIRE id of the entity" - }, - "type": { - "type": "string", - "description": "The type of the entity (i.e. organisation)" - } - } - } - }, - "type": "object", - "properties": { - "provenance": { - "type": "object", - "properties": { - "provenance": { - "type": "string", - "description": "The reason why OpenAIRE holds the relation " - }, - "trust": { - "type": "string", - "description": "The trust of the relation in the range of [0,1]. Where greater the number, more the trust. Harvested relationships have typically a high trust (0.9). The trust of inferred relationship is calculated by the inference algorithm that generated them, as described in https://graph.openaire.eu/about#architecture (Enrichment --> Mining)" - } - } - }, - "reltype": { - "type": "object", - "properties": { - "name": { - "type": "string", - "description": "The semantics of the relation (i.e. isAuthorInstitutionOf). " - }, - "type": { - "type": "string", - "description": "the type of the relation (i.e. affiliation)" - } - }, - "description": "To represent the semantics of a relation between two entities" - }, - "source": { - "allOf": [ - {"$ref": "#/definitions/Node"}, - {"description": "The node source in the relation"} - ] - }, - "target": { - "allOf": [ - {"$ref": "#/definitions/Node"}, - {"description": "The node target in the relation"} - ] - } - } -} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/result_schema.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/result_schema.json deleted file mode 100644 index 03cbfb074..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/schema/result_schema.json +++ /dev/null @@ -1,398 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "definitions": { - "AccessRight":{ - "type":"object", - "properties":{ - "code": { - "type": "string", - "description": "COAR access mode code: http://vocabularies.coar-repositories.org/documentation/access_rights/" - }, - "label": { - "type": "string", - "description": "Label for the access mode" - }, - "scheme": { - "type": "string", - "description": "Scheme of reference for access right code. Always set to COAR access rights vocabulary: http://vocabularies.coar-repositories.org/documentation/access_rights/" - } - } - }, - "ControlledField": { - "type": "object", - "properties": { - "scheme": { - "type": "string" - }, - "value": { - "type": "string" - } - }, - "description": "To represent the information described by a scheme and a value in that scheme (i.e. pid)" - }, - "Provenance": { - "type": "object", - "properties": { - "provenance": { - "type": "string", - "description": "The process that produced/provided the information" - }, - "trust": { - "type": "string" - } - }, - "description": "Indicates the process that produced (or provided) the information, and the trust associated to the information" - } - }, - "type": "object", - "properties": { - "author": { - "type": "array", - "items": { - "type": "object", - "properties": { - "fullname": { - "type": "string" - }, - "name": { - "type": "string" - }, - "pid": { - "type": "object", - "properties": { - "id": { - "allOf": [ - {"$ref": "#/definitions/ControlledField"}, - {"description": "The author's id and scheme. OpenAIRE currently supports 'ORCID'"} - ] - }, - "provenance": { - "allOf": [ - {"$ref": "#/definitions/Provenance"}, - {"description": "Provenance of author's pid"} - ] - } - } - }, - "rank": { - "type": "integer" - }, - "surname": { - "type": "string" - } - } - } - }, - "bestaccessright": { - "type": "object", - "properties": { - "code": { - "type": "string", - "description": "COAR access mode code: http://vocabularies.coar-repositories.org/documentation/access_rights/" - }, - "label": { - "type": "string", - "description": "Label for the access mode" - }, - "scheme": { - "type": "string", - "description": "Scheme of reference for access right code. Always set to COAR access rights vocabulary: http://vocabularies.coar-repositories.org/documentation/access_rights/" - } - }, - "description": "The openest access right associated to the manifestations of this research results" - }, - "codeRepositoryUrl": { - "type": "string", - "description": "Only for results with type 'software': the URL to the repository with the source code" - }, - "contactgroup": { - "description": "Only for results with type 'software': Information on the group responsible for providing further information regarding the resource", - "type": "array", - "items": { - "type": "string" - } - }, - "contactperson": { - "description": "Only for results with type 'software': Information on the person responsible for providing further information regarding the resource", - "type": "array", - "items": { - "type": "string" - } - }, - "container": { - "type": "object", - "properties": { - "conferencedate": { - "type": "string" - }, - "conferenceplace": { - "type": "string" - }, - "edition": { - "type": "string", - "description": "Edition of the journal or conference proceeding" - }, - "ep": { - "type": "string", - "description": "End page" - }, - "iss": { - "type": "string", - "description": "Journal issue" - }, - "issnLinking": { - "type": "string" - }, - "issnOnline": { - "type": "string" - }, - "issnPrinted": { - "type": "string" - }, - "name": { - "type": "string", - "description": "Name of the journal or conference" - }, - "sp": { - "type": "string", - "description": "start page" - }, - "vol": { - "type": "string" - } - }, - "description": "Container has information about the conference or journal where the result has been presented or published" - }, - "contributor": { - "type": "array", - "items": { - "type": "string", - "description": "Description of contributor" - } - }, - "country": { - "type": "array", - "items": { - "type": "object", - "properties": { - "code": { - "type": "string", - "description": "ISO 3166-1 alpha-2 country code" - }, - "label": { - "type": "string" - }, - "provenance": { - "allOf": [ - {"$ref": "#/definitions/Provenance"}, - {"description": "Why this result is associated to the country."} - ] - } - } - } - }, - "coverage": { - "type": "array", - "items": { - "type": "string" - } - }, - "dateofcollection": { - "type": "string", - "description": "When OpenAIRE collected the record the last time" - }, - "description": { - "type": "array", - "items": { - "type": "string" - } - }, - "documentationUrl": { - "description": "Only for results with type 'software': URL to the software documentation", - "type": "array", - "items": { - "type": "string" - } - }, - "embargoenddate": { - "type": "string", - "description": "Date when the embargo ends and this result turns Open Access" - }, - "format": { - "type": "array", - "items": { - "type": "string" - } - }, - "geolocation": { - "description": "Geolocation information", - "type": "array", - "items": { - "type": "object", - "properties": { - "box": { - "type": "string" - }, - "place": { - "type": "string" - }, - "point": { - "type": "string" - } - } - } - }, - "id": { - "type": "string", - "description": "OpenAIRE Identifier" - }, - "language": { - "type": "object", - "properties": { - "code": { - "type": "string", - "description": "alpha-3/ISO 639-2 code of the language" - }, - "label": { - "type": "string", - "description": "English label" - } - } - }, - "lastupdatetimestamp": { - "type": "integer", - "description": "Timestamp of last update of the record in OpenAIRE" - }, - "maintitle": { - "type": "string" - }, - "originalId": { - "description": "Identifiers of the record at the original sources", - "type": "array", - "items": { - "type": "string" - } - }, - "pid": { - "description": "Persistent identifiers of the result", - "type": "array", - "items": { - "allOf": [ - {"$ref": "#/definitions/ControlledField"}, - {"description": "scheme: list of available schemes are at https://api.openaire.eu/vocabularies/dnet:pid_types, value: the PID of the result "} - ] - } - }, - "instance":{ - "type":"array", - "items":{ - "type":"object", - "properties":{ - "accessright":{ - "allOf":[ - { - "$ref":"#/definitions/AccessRight" - }, - { - "description":"The accessright of this materialization of the result" - } - ] - }, - "articleprocessingcharge":{ - "type":"object", - "properties":{ - "amount":{ - "type":"string" - }, - "currency":{ - "type":"string" - } - } - }, - "license":{ - "type":"string" - }, - "publicationdate":{ - "type":"string" - }, - "refereed":{ - "type":"string" - }, - "type":{ - "type":"string", - "description":"The specific sub-type of this materialization of the result (see https://api.openaire.eu/vocabularies/dnet:result_typologies following the links)" - }, - "url":{ - "description":"Description of url", - "type":"array", - "items":{ - "type":"string", - "description":"urls where it is possible to access the materialization of the result" - } - } - }, - "description":"One of the materialization for this result" - } - }, - "programmingLanguage": { - "type": "string", - "description": "Only for results with type 'software': the programming language" - }, - "publicationdate": { - "type": "string" - }, - "publisher": { - "type": "string" - }, - "size": { - "type": "string", - "description": "Only for results with type 'dataset': the declared size of the dataset" - }, - "source": { - "description": "See definition of Dublin Core field dc:source", - "type": "array", - "items": { - "type": "string" - } - }, - "subjects": { - "description": "Keywords associated to the result", - "type": "array", - "items": { - "type": "object", - "properties": { - "provenance": { - "allOf": [ - {"$ref": "#/definitions/Provenance"}, - {"description": "Why this subject is associated to the result"} - ] - }, - "subject": { - "allOf": [ - {"$ref": "#/definitions/ControlledField"}, - {"description": "OpenAIRE subject classification scheme (https://api.openaire.eu/vocabularies/dnet:subject_classification_typologies) and value. When the scheme is 'keyword', it means that the subject is free-text (i.e. not a term from a controlled vocabulary)."} - ] - } - } - } - }, - "subtitle": { - "type": "string" - }, - "tool": { - "description": "Only for results with type 'other': tool useful for the interpretation and/or re-used of the research product", - "type": "array", - "items": { - "type": "string" - } - }, - "type": { - "type": "string", - "description": "Type of the result: one of 'publication', 'dataset', 'software', 'other' (see also https://api.openaire.eu/vocabularies/dnet:result_typologies)" - }, - "version": { - "type": "string", - "description": "Version of the result" - } - } -} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/funderresults/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/funderresults/oozie_app/config-default.xml deleted file mode 100644 index e5ec3d0ae..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/funderresults/oozie_app/config-default.xml +++ /dev/null @@ -1,30 +0,0 @@ - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - oozie.use.system.libpath - true - - - hiveMetastoreUris - thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 - - - hiveJdbcUrl - jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000 - - - hiveDbName - openaire - - - oozie.launcher.mapreduce.user.classpath.first - true - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/funderresults/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/funderresults/oozie_app/workflow.xml deleted file mode 100644 index 650b972fa..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/funderresults/oozie_app/workflow.xml +++ /dev/null @@ -1,563 +0,0 @@ - - - - - upload - false - true to upload the dump for the funders in Zenodo - - - sourcePath - the source path - - - isLookUpUrl - the isLookup service endpoint - - - outputPath - the output path - - - accessToken - the access token used for the deposition in Zenodo - - - connectionUrl - the connection url for Zenodo - - - metadata - the metadata associated to the deposition - - - depositionType - the type of deposition we want to perform. "new" for brand new deposition, "version" for a new version of a published deposition (in this case the concept record id must be provided), "upload" to upload content to an open deposition for which we already have the deposition id (in this case the deposition id should be provided) - - - conceptRecordId - for new version, the id of the record for the old deposition - - - depositionId - the depositionId of a deposition open that has to be added content - - - hiveDbName - the target hive database name - - - hiveJdbcUrl - hive server jdbc url - - - hiveMetastoreUris - hive server metastore URIs - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - number of cores used by single executor - - - oozieActionShareLibForSpark2 - oozie action sharelib for spark 2.* - - - spark2ExtraListeners - com.cloudera.spark.lineage.NavigatorAppListener - spark 2.* extra listeners classname - - - spark2SqlQueryExecutionListeners - com.cloudera.spark.lineage.NavigatorQueryListener - spark 2.* sql query execution listeners classname - - - spark2YarnHistoryServerAddress - spark 2.* yarn history server address - - - spark2EventLogDir - spark 2.* event log dir location - - - - - ${jobTracker} - ${nameNode} - - - mapreduce.job.queuename - ${queueName} - - - oozie.launcher.mapred.job.queue.name - ${oozieLauncherQueueName} - - - oozie.action.sharelib.for.spark - ${oozieActionShareLibForSpark2} - - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - eu.dnetlib.dhp.oa.graph.dump.SaveCommunityMap - --outputPath${workingDir}/communityMap - --nameNode${nameNode} - --isLookUpUrl${isLookUpUrl} - - - - - - - - - - - - - - - yarn - cluster - Dump funder results - eu.dnetlib.dhp.oa.graph.dump.funderresults.SparkResultLinkedToProject - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/publication - --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication - --outputPath${workingDir}/result/publication - --relationPath${sourcePath}/relation - - - - - - - - yarn - cluster - Dump funder results - eu.dnetlib.dhp.oa.graph.dump.funderresults.SparkResultLinkedToProject - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/dataset - --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset - --outputPath${workingDir}/result/dataset - --relationPath${sourcePath}/relation - - - - - - - - yarn - cluster - Dump funder results - eu.dnetlib.dhp.oa.graph.dump.funderresults.SparkResultLinkedToProject - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/otherresearchproduct - --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - --outputPath${workingDir}/result/otherresearchproduct - --relationPath${sourcePath}/relation - - - - - - - - yarn - cluster - Dump funder results - eu.dnetlib.dhp.oa.graph.dump.funderresults.SparkResultLinkedToProject - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath}/software - --resultTableNameeu.dnetlib.dhp.schema.oaf.Software - --outputPath${workingDir}/result/software - --relationPath${sourcePath}/relation - - - - - - - - - - - - - - - - - yarn - cluster - Dump table publication for community related products - eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${workingDir}/result/publication - --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication - --outputPath${workingDir}/dump/publication - --communityMapPath${workingDir}/communityMap - --dumpTypefunder - - - - - - - - yarn - cluster - Dump table dataset for community related products - eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${workingDir}/result/dataset - --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset - --outputPath${workingDir}/dump/dataset - --communityMapPath${workingDir}/communityMap - --dumpTypefunder - - - - - - - - yarn - cluster - Dump table ORP for community related products - eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${workingDir}/result/otherresearchproduct - --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - --outputPath${workingDir}/dump/otherresearchproduct - --communityMapPath${workingDir}/communityMap - --dumpTypefunder - - - - - - - - yarn - cluster - Dump table software for community related products - eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${workingDir}/result/software - --resultTableNameeu.dnetlib.dhp.schema.oaf.Software - --outputPath${workingDir}/dump/software - --communityMapPath${workingDir}/communityMap - --dumpTypefunder - - - - - - - - - - yarn - cluster - Prepare association result subset of project info - eu.dnetlib.dhp.oa.graph.dump.community.SparkPrepareResultProject - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath} - --outputPath${workingDir}/preparedInfo - - - - - - - - - - - - - - - yarn - cluster - Extend dumped publications with information about project - eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${workingDir}/dump/publication - --outputPath${workingDir}/ext/publication - --preparedInfoPath${workingDir}/preparedInfo - - - - - - - - yarn - cluster - Extend dumped dataset with information about project - eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${workingDir}/dump/dataset - --outputPath${workingDir}/ext/dataset - --preparedInfoPath${workingDir}/preparedInfo - - - - - - - - yarn - cluster - Extend dumped ORP with information about project - eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${workingDir}/dump/otherresearchproduct - --outputPath${workingDir}/ext/orp - --preparedInfoPath${workingDir}/preparedInfo - - - - - - - - yarn - cluster - Extend dumped software with information about project - eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${workingDir}/dump/software - --outputPath${workingDir}/ext/software - --preparedInfoPath${workingDir}/preparedInfo - - - - - - - - - yarn - cluster - Dump funder results - eu.dnetlib.dhp.oa.graph.dump.funderresults.SparkDumpFunderResults - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${workingDir}/ext - --outputPath${workingDir}/resultperfunder - --relationPath${sourcePath} - - - - - - - - eu.dnetlib.dhp.oa.graph.dump.MakeTar - --hdfsPath${outputPath} - --nameNode${nameNode} - --sourcePath${workingDir}/resultperfunder - - - - - - - - ${wf:conf('upload') eq true} - - - - - - - eu.dnetlib.dhp.oa.graph.dump.SendToZenodoHDFS - --hdfsPath${outputPath} - --nameNode${nameNode} - --accessToken${accessToken} - --connectionUrl${connectionUrl} - --metadata${metadata} - --communityMapPath${workingDir}/communityMap - --conceptRecordId${conceptRecordId} - --depositionType${depositionType} - --depositionId${depositionId} - - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/input_collect_and_save.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_collect_and_save.json similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/input_collect_and_save.json rename to dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_collect_and_save.json diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/input_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_complete_parameters.json similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/input_parameters.json rename to dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_complete_parameters.json diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/input_entity_parameter.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_entity_parameter.json similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/input_entity_parameter.json rename to dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_entity_parameter.json diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/input_organization_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_organization_parameters.json similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/input_organization_parameters.json rename to dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_organization_parameters.json diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/input_relationdump_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_relationdump_parameters.json similarity index 73% rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/input_relationdump_parameters.json rename to dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_relationdump_parameters.json index 2bfcac3bc..5c26ea7d1 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/complete/input_relationdump_parameters.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_relationdump_parameters.json @@ -19,6 +19,12 @@ "paramLongName": "isSparkSessionManaged", "paramDescription": "true if the spark session is managed, false otherwise", "paramRequired": false + }, + { + "paramName": "rs", + "paramLongName": "removeSet", + "paramDescription": "the list of classname relations, split by ';', not to be dumped", + "paramRequired": false } ] diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/community_infrastructure_schema.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/community_infrastructure_schema.json deleted file mode 100644 index 727432a67..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/community_infrastructure_schema.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "properties": { - "acronym": { - "type": "string", - "description": "The acronym of the community" - }, - "description": { - "type": "string", - "description": "Description of the research community/research infrastructure" - }, - "id": { - "type": "string", - "description": "OpenAIRE id of the research community/research infrastructure" - }, - "name": { - "type": "string", - "description": "The long name of the community" - }, - "subject": { - "description": "Only for research communities: the list of the subjects associated to the research community", - "type": "array", - "items": {"type": "string"} - }, - "type": { - "type": "string", - "description": "One of {Research Community, Research infrastructure}" - }, - "zenodo_community": { - "type": "string", - "description": "The URL of the Zenodo community associated to the Research community/Research infrastructure" - } - } -} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/datasource_schema.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/datasource_schema.json deleted file mode 100644 index c416f2320..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/datasource_schema.json +++ /dev/null @@ -1,192 +0,0 @@ -{ - "$schema":"http://json-schema.org/draft-07/schema#", - "definitions": { - "ControlledField": { - "type": "object", - "properties": { - "scheme": { - "type": "string" - }, - "value": { - "type": "string" - } - }, - "description": "To represent the information described by a scheme and a value in that scheme (i.e. pid)" - } - }, - "type":"object", - "properties": { - "accessrights": { - "type": "string", - "description": "Type of access to the data source, as defined by re3data.org. Possible values: {open, restricted, closed}" - }, - "certificates": { - "type": "string", - "description": "The certificate, seal or standard the data source complies with. As defined by re3data.org." - }, - "citationguidelineurl": { - "type": "string", - "description":"The URL of the data source providing information on how to cite its items. As defined by re3data.org." - }, - "contenttypes": { - "description": "Types of content in the data source, as defined by OpenDOAR", - "type": "array", - "items": { - "type": "string" - } - }, - "databaseaccessrestriction": { - "type": "string", - "description": "Access restrinctions to the data source, as defined by re3data.org. One of {feeRequired, registration, other}" - }, - "datasourcetype": { - "allOf": [ - { - "$ref": "#/definitions/ControlledField" - }, - { - "description": "The type of the datasource. See https://api.openaire.eu/vocabularies/dnet:datasource_typologies" - } - ] - }, - "datauploadrestriction": { - "type": "string", - "description": "Upload restrictions applied by the datasource, as defined by re3data.org. One of {feeRequired, registration, other}" - }, - "dateofvalidation": { - "type": "string", - "description": "The date of last validation against the OpenAIRE guidelines for the datasource records" - }, - "description": { - "type": "string" - }, - "englishname": { - "type": "string", - "description": "The English name of the datasource" - }, - "id": { - "type": "string", - "description": "The OpenAIRE id of the data source" - }, - "journal": { - "type": "object", - "properties": { - "conferencedate": { - "type": "string" - }, - "conferenceplace": { - "type": "string" - }, - "edition": { - "type": "string" - }, - "ep": { - "type": "string", - "description": "End page" - }, - "iss": { - "type": "string", - "description": "Issue number" - }, - "issnLinking": { - "type": "string" - }, - "issnOnline": { - "type": "string" - }, - "issnPrinted": { - "type": "string" - }, - "name": { - "type": "string" - }, - "sp": { - "type": "string", - "description": "Start page" - }, - "vol": { - "type": "string", - "description": "Volume" - } - }, - "description": "Information about the journal, if this data source is of type Journal." - }, - "languages": { - "description": "The languages present in the data source's content, as defined by OpenDOAR.", - "type": "array", - "items": { - "type": "string" - } - }, - "logourl": { - "type": "string" - }, - "missionstatementurl": { - "type": "string", - "description":"The URL of a mission statement describing the designated community of the data source. As defined by re3data.org" - }, - "officialname": { - "type": "string", - "description": "The official name of the datasource" - }, - "openairecompatibility": { - "type": "string", - "description": "OpenAIRE guidelines the data source comply with. See also https://guidelines.openaire.eu." - }, - "originalId": { - "description": "Original identifiers for the datasource" - "type": "array", - "items": { - "type": "string" - } - }, - "pid": { - "description": "Persistent identifiers of the datasource", - "type": "array", - "items": { - "allOf": [ - { - "$ref": "#/definitions/ControlledField" - } - ] - } - }, - "pidsystems": { - "type": "string", - "description": "The persistent identifier system that is used by the data source. As defined by re3data.org" - }, - "policies": { - "description": "Policies of the data source, as defined in OpenDOAR.", - "type": "array", - "items": { - "type": "string" - } - }, - "releaseenddate": { - "type": "string", - "description": "Date when the data source went offline or stopped ingesting new research data. As defined by re3data.org" - }, - "releasestartdate": { - "type": "string", - "description": "Releasing date of the data source, as defined by re3data.org" - }, - "subjects": { - "description": "List of subjects associated to the datasource", - "type": "array", - "items": { - "type": "string" - } - }, - "uploadrights": { - "type": "string", - "description": "Type of data upload. As defined by re3data.org: one of {open, restricted,closed}" - }, - "versioning": { - "type": "boolean", - "description": "As defined by redata.org: 'yes' if the data source supports versioning, 'no' otherwise." - }, - "websiteurl": { - "type": "string" - } - } -} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/organization_schema.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/organization_schema.json deleted file mode 100644 index 16afa386d..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/organization_schema.json +++ /dev/null @@ -1,57 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "properties": { - "alternativenames": { - "description": "Alternative names that identify the organisation", - "type": "array", - "items": { - "type": "string" - } - }, - "country": { - "type": "object", - "properties": { - "code": { - "type": "string", - "description": "The organisation country code" - }, - "label": { - "type": "string", - "description": "The organisation country label" - } - }, - "description": "The country of the organisation" - }, - "id": { - "type": "string", - "description": "The OpenAIRE id for the organisation" - }, - "legalname": { - "type": "string" - }, - "legalshortname": { - "type": "string" - }, - "pid": { - "description": "Persistent identifiers for the organisation i.e. isni 0000000090326370", - "type": "array", - "items": { - "type": "object", - "properties": { - "scheme": { - "type": "string", - "description": "The scheme of the identifier (i.e. isni)" - }, - "value": { - "type": "string", - "description": "the value in the schema (i.e. 0000000090326370)" - } - } - } - }, - "websiteurl": { - "type": "string" - } - } -} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/project_schema.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/project_schema.json deleted file mode 100644 index c81187258..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/project_schema.json +++ /dev/null @@ -1,119 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "properties": { - "acronym": { - "type": "string" - }, - "callidentifier": { - "type": "string" - }, - "code": { - "type": "string", - "description": "The grant agreement number" - }, - "enddate": { - "type": "string" - }, - "funding": { - "description": "Funding information for the project", - "type": "array", - "items": { - "type": "object", - "properties": { - "funding_stream": { - "type": "object", - "properties": { - "description": { - "type": "string", - "description": "Description of the funding stream" - }, - "id": { - "type": "string", - "description": "Id of the funding stream" - } - } - }, - "jurisdiction": { - "type": "string", - "description": "The jurisdiction of the funder (i.e. EU)" - }, - "name": { - "type": "string", - "description": "The name of the funder (European Commission)" - }, - "shortName": { - "type": "string", - "description": "The short name of the funder (EC)" - } - } - } - }, - "granted": { - "type": "object", - "properties": { - "currency": { - "type": "string", - "description": "The currency of the granted amount (e.g. EUR)" - }, - "fundedamount": { - "type": "number", - "description": "The funded amount" - }, - "totalcost": { - "type": "number", - "description": "The total cost of the project" - } - }, - "description": "The money granted to the project" - }, - "h2020programme": { - "description": "The h2020 programme funding the project", - "type": "array", - "items": { - "type": "object", - "properties": { - "code": { - "type": "string", - "description": "The code of the programme" - }, - "description": { - "type": "string", - "description": "The description of the programme" - } - } - } - }, - "id": { - "type": "string", - "description": "OpenAIRE id for the project" - }, - "keywords": { - "type": "string" - }, - "openaccessmandatefordataset": { - "type": "boolean" - }, - "openaccessmandateforpublications": { - "type": "boolean" - }, - "startdate": { - "type": "string" - }, - "subject": { - "type": "array", - "items": { - "type": "string" - } - }, - "summary": { - "type": "string" - }, - "title": { - "type": "string" - }, - "websiteurl": { - "type": "string" - } - } -} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/relation_schema.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/relation_schema.json deleted file mode 100644 index 98134a03b..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/relation_schema.json +++ /dev/null @@ -1,68 +0,0 @@ -{ - "$schema":"http://json-schema.org/draft-07/schema#", - "definitions": { - "Node": { - "type": "object", - "properties": { - "id": { - "type": "string", - "description": "The OpenAIRE id of the entity" - }, - "type": { - "type": "string", - "description": "The type of the entity (i.e. organisation)" - } - } - } - }, - "type":"object", - "properties": { - "provenance": { - "type": "object", - "properties": { - "provenance": { - "type": "string", - "description": "The reason why OpenAIRE holds the relation " - }, - "trust": { - "type": "string", - "description": "The trust of the relation in the range of [0,1]. Where greater the number, more the trust. Harvested relationships have typically a high trust (0.9). The trust of inferred relationship is calculated by the inference algorithm that generated them, as described in https://graph.openaire.eu/about#architecture (Enrichment --> Mining)" - } - } - }, - "reltype": { - "type": "object", - "properties": { - "name": { - "type": "string", - "description": "The semantics of the relation (i.e. isAuthorInstitutionOf). " - }, - "type": { - "type": "string", - "description": "the type of the relation (i.e. affiliation)" - } - }, - "description": "To represent the semantics of a relation between two entities" - }, - "source": { - "allOf": [ - {"$ref": "#/definitions/Node"}, - {"description": "The node source in the relation"} - ] - }, - "target": { - "allOf": [ - {"$ref": "#/definitions/Node"}, - {"description": "The node target in the relation"} - ] - }, - "validated":{ - "type":"boolean", - "description":"True if the relation is related to a project and it has been collected from an authoritative source (i.e. the funder)" - }, - "validationDate":{ - "type":"string", - "description":"The date when the relation was collected from OpenAIRE" - } - } -} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/result_schema.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/result_schema.json deleted file mode 100644 index a1e09525e..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/schemas/result_schema.json +++ /dev/null @@ -1,417 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "definitions": { - "ControlledField": { - "type": "object", - "properties": { - "scheme": { - "type": "string" - }, - "value": { - "type": "string" - } - }, - "description": "To represent the information described by a scheme and a value in that scheme (i.e. pid)" - }, - "Provenance": { - "type": "object", - "properties": { - "provenance": { - "type": "string", - "description": "The process that produced/provided the information" - }, - "trust": { - "type": "string" - } - }, - "description": "Indicates the process that produced (or provided) the information, and the trust associated to the information" - } - }, - "type": "object", - "properties": { - "author": { - "type": "array", - "items": { - "type": "object", - "properties": { - "fullname": { - "type": "string" - }, - "name": { - "type": "string" - }, - "pid": { - "type": "object", - "properties": { - "id": { - "allOf": [ - {"$ref": "#/definitions/ControlledField"}, - {"description": "The author's id and scheme. OpenAIRE currently supports 'ORCID'"} - ] - }, - "provenance": { - "allOf": [ - {"$ref": "#/definitions/Provenance"}, - {"description": "Provenance of author's pid"} - ] - } - } - }, - "rank": { - "type": "integer" - }, - "surname": { - "type": "string" - } - } - } - }, - "bestaccessright":{ - "type":"object", - "properties":{ - "code": { - "type": "string", - "description": "COAR access mode code: http://vocabularies.coar-repositories.org/documentation/access_rights/" - }, - "label": { - "type": "string", - "description": "Label for the access mode" - }, - "scheme": { - "type": "string", - "description": "Scheme of reference for access right code. Always set to COAR access rights vocabulary: http://vocabularies.coar-repositories.org/documentation/access_rights/" - } - } - }, - "codeRepositoryUrl": { - "type": "string", - "description": "Only for results with type 'software': the URL to the repository with the source code" - }, - "contactgroup": { - "description": "Only for results with type 'software': Information on the group responsible for providing further information regarding the resource", - "type": "array", - "items": { - "type": "string" - } - }, - "contactperson": { - "description": "Only for results with type 'software': Information on the person responsible for providing further information regarding the resource", - "type": "array", - "items": { - "type": "string" - } - }, - "container": { - "type": "object", - "properties": { - "conferencedate": { - "type": "string" - }, - "conferenceplace": { - "type": "string" - }, - "edition": { - "type": "string", - "description": "Edition of the journal or conference proceeding" - }, - "ep": { - "type": "string", - "description": "End page" - }, - "iss": { - "type": "string", - "description": "Journal issue" - }, - "issnLinking": { - "type": "string" - }, - "issnOnline": { - "type": "string" - }, - "issnPrinted": { - "type": "string" - }, - "name": { - "type": "string", - "description": "Name of the journal or conference" - }, - "sp": { - "type": "string", - "description": "start page" - }, - "vol": { - "type": "string" - } - }, - "description": "Container has information about the conference or journal where the result has been presented or published" - }, - "contributor": { - "type": "array", - "items": { - "type": "string", - "description": "Contributors for the result" - } - }, - "country": { - "type": "array", - "items": { - "type": "object", - "properties": { - "code": { - "type": "string", - "description": "ISO 3166-1 alpha-2 country code" - }, - "label": { - "type": "string" - }, - "provenance": { - "allOf": [ - {"$ref": "#/definitions/Provenance"}, - {"description": "Why this result is associated to the country."} - ] - } - } - } - }, - "coverage": { - "type": "array", - "items": { - "type": "string" - } - }, - "dateofcollection": { - "type": "string", - "description": "When OpenAIRE collected the record the last time" - }, - "description": { - "type": "array", - "items": { - "type": "string" - } - }, - "documentationUrl": { - "description": "Only for results with type 'software': URL to the software documentation", - "type": "array", - "items": { - "type": "string" - } - }, - "embargoenddate": { - "type": "string", - "description": "Date when the embargo ends and this result turns Open Access" - }, - "format": { - "type": "array", - "items": { - "type": "string" - } - }, - "geolocation": { - "description": "Geolocation information", - "type": "array", - "items": { - "type": "object", - "properties": { - "box": { - "type": "string" - }, - "place": { - "type": "string" - }, - "point": { - "type": "string" - } - } - } - }, - "id": { - "type": "string", - "description": "OpenAIRE Identifier" - }, - "instance":{ - "description":"Each instance is one specific materialisation or version of the result. For example, you can have one result with three instance: one is the pre-print, one is the post-print, one is te published version", - "type":"array", - "items":{ - "type":"object", - "properties":{ - "accessright":{ - "type":"object", - "properties":{ - "code": { - "type": "string", - "description": "COAR access mode code: http://vocabularies.coar-repositories.org/documentation/access_rights/" - }, - "label": { - "type": "string", - "description": "Label for the access mode" - }, - "openAccessRoute":{ - "type":"string", - "enum":[ - "gold", - "green", - "hybrid", - "bronze" - ], - "description":"The type of OpenAccess applied to the result" - }, - "scheme": { - "type": "string", - "description": "Scheme of reference for access right code. Always set to COAR access rights vocabulary: http://vocabularies.coar-repositories.org/documentation/access_rights/" - } - } - }, - "articleprocessingcharge":{ - "description": "The money spent to make this book or article available in Open Access. Source for this information is the OpenAPC initiative.", - "type":"object", - "properties":{ - "amount":{ - "type":"string" - }, - "currency":{ - "type":"string" - } - } - }, - "license":{ - "type":"string" - }, - "pid":{ - "description":"The set of persistent identifiers associated to this instance that have been collected from an authority for the pid type (i.e. Crossref/Datacite for doi)", - "type":"array", - "items":{ - "allOf":[ - { - "$ref":"#/definitions/ControlledField" - }, - { - "description":"The persistent identifier associated to the result" - } - ] - } - }, - "publicationdate":{ - "type":"string", - "description": "Date of the research product" - }, - "refereed":{ - "description": "If this instance has been peer-reviewed or not. Allowed values are peerReviewed, nonPeerReviewed, UNKNOWN (as defined in https://api.openaire.eu/vocabularies/dnet:review_levels)", - "type":"string" - }, - "type":{ - "type":"string", - "description":"The specific sub-type of this instance (see https://api.openaire.eu/vocabularies/dnet:result_typologies following the links)" - }, - "url":{ - "description":"URLs to the instance. They may link to the actual full-text or to the landing page at the hosting source. ", - "type":"array", - "items":{ - "type":"string" - } - } - } - } - }, - "language": { - "type": "object", - "properties": { - "code": { - "type": "string", - "description": "alpha-3/ISO 639-2 code of the language" - }, - "label": { - "type": "string", - "description": "Language label in English" - } - } - }, - "lastupdatetimestamp": { - "type": "integer", - "description": "Timestamp of last update of the record in OpenAIRE" - }, - "maintitle": { - "type": "string", - "descriptio": "A name or title by which a scientific result is known. May be the title of a publication, of a dataset or the name of a piece of software." - }, - "subtitle": { - "type": "string", - "descriptio": "Explanatory or alternative name by which a scientific result is known." - }, - "originalId": { - "description": "Identifiers of the record at the original sources", - "type": "array", - "items": { - "type": "string" - } - }, - "pid": { - "description": "Persistent identifiers of the result", - "type": "array", - "items": { - "allOf": [ - {"$ref": "#/definitions/ControlledField"}, - {"description": "scheme: list of available schemes are at https://api.openaire.eu/vocabularies/dnet:pid_types, value: the PID of the result. Note: the result will have a pid associated only if it was collected from an authority for that pid type. For example a doi will be among the pids for one result if the result metadata were collected from Crossref or Datacite. In all the other cases, the doi will be present among the alteranteIdentifiers for the result "} - ] - } - }, - "programmingLanguage": { - "type": "string", - "description": "Only for results with type 'software': the programming language" - }, - "publicationdate": { - "type": "string", - "description": "Main date of the research product: typically the publication or issued date. In case of a research result with different versions with different dates, the date of the result is selected as the most frequent well-formatted date. If not available, then the most recent and complete date among those that are well-formatted. For statistics, the year is extracted and the result is counted only among the result of that year. Example: Pre-print date: 2019-02-03, Article date provided by repository: 2020-02, Article date provided by Crossref: 2020, OpenAIRE will set as date 2019-02-03, because it’s the most recent among the complete and well-formed dates. If then the repository updates the metadata and set a complete date (e.g. 2020-02-12), then this will be the new date for the result because it becomes the most recent most complete date. However, if OpenAIRE then collects the pre-print from another repository with date 2019-02-03, then this will be the “winning date” because it becomes the most frequent well-formatted date." - }, - "publisher": { - "type": "string", - "description": "The name of the entity that holds, archives, publishes prints, distributes, releases, issues, or produces the resource." - }, - "size": { - "type": "string", - "description": "Only for results with type 'dataset': the declared size of the dataset" - }, - "source": { - "description": "See definition of Dublin Core field dc:source", - "type": "array", - "items": { - "type": "string" - } - }, - "subjects": { - "description": "Keywords associated to the result", - "type": "array", - "items": { - "type": "object", - "properties": { - "provenance": { - "allOf": [ - {"$ref": "#/definitions/Provenance"}, - {"description": "Why this subject is associated to the result"} - ] - }, - "subject": { - "allOf": [ - {"$ref": "#/definitions/ControlledField"}, - {"description": "OpenAIRE subject classification scheme (https://api.openaire.eu/vocabularies/dnet:subject_classification_typologies) and value. When the scheme is 'keyword', it means that the subject is free-text (i.e. not a term from a controlled vocabulary)."} - ] - } - } - } - }, - "tool": { - "description": "Only for results with type 'other': tool useful for the interpretation and/or re-used of the research product", - "type": "array", - "items": { - "type": "string" - } - }, - "type": { - "type": "string", - "description": "Type of the result: one of 'publication', 'dataset', 'software', 'other' (see also https://api.openaire.eu/vocabularies/dnet:result_typologies)" - }, - "version": { - "type": "string", - "description": "Version of the result" - } - } -} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/complete/DumpRelationTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/complete/DumpRelationTest.java index fe178795d..3dbf2b09d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/complete/DumpRelationTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/complete/DumpRelationTest.java @@ -4,8 +4,10 @@ package eu.dnetlib.dhp.oa.graph.dump.complete; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; + import java.util.HashMap; + import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -81,7 +83,6 @@ public class DumpRelationTest { "-sourcePath", sourcePath }); -// dumpCommunityProducts.exec(MOCK_IS_LOOK_UP_URL,Boolean.FALSE, workingDir.toString()+"/dataset",sourcePath,"eu.dnetlib.dhp.schema.oaf.Dataset","eu.dnetlib.dhp.schema.dump.oaf.Dataset"); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); @@ -144,7 +145,6 @@ public class DumpRelationTest { "-sourcePath", sourcePath }); -// dumpCommunityProducts.exec(MOCK_IS_LOOK_UP_URL,Boolean.FALSE, workingDir.toString()+"/dataset",sourcePath,"eu.dnetlib.dhp.schema.oaf.Dataset","eu.dnetlib.dhp.schema.dump.oaf.Dataset"); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); @@ -203,4 +203,61 @@ public class DumpRelationTest { "and validationDate = '2021-08-06'") .count()); } + + @Test + public void test3() throws Exception {// + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/relation/relation") + .getPath(); + + SparkDumpRelationJob.main(new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-outputPath", workingDir.toString() + "/relation", + "-sourcePath", sourcePath, + "-removeSet", "isParticipant" + }); + + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/relation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Relation.class)); + + verificationDataset.createOrReplaceTempView("table"); + + verificationDataset + .foreach((ForeachFunction) r -> System.out.println(new ObjectMapper().writeValueAsString(r))); + + Dataset check = spark + .sql( + "SELECT reltype.name, source.id source, source.type stype, target.id target,target.type ttype, provenance.provenance " + + + "from table "); + + Assertions.assertEquals(22, check.filter("name = 'isProvidedBy'").count()); + Assertions + .assertEquals( + 22, check + .filter( + "name = 'isProvidedBy' and stype = 'datasource' and ttype = 'organization' and " + + "provenance = 'Harvested'") + .count()); + + Assertions.assertEquals(0, check.filter("name = 'isParticipant'").count()); + + + Assertions.assertEquals(1, check.filter("name = 'isAuthorInstitutionOf'").count()); + Assertions + .assertEquals( + 1, check + .filter( + "name = 'isAuthorInstitutionOf' and stype = 'organization' and ttype = 'result' " + + "and provenance = 'Inferred by OpenAIRE'") + .count()); + } + }