diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/FunderResults.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/FunderResults.java new file mode 100644 index 0000000000..18884a327a --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/FunderResults.java @@ -0,0 +1,19 @@ + +package eu.dnetlib.dhp.oa.graph.dump.funderresults; + +import java.io.Serializable; +import java.util.List; + +import eu.dnetlib.dhp.schema.dump.oaf.Result; + +public class FunderResults implements Serializable { + private List results; + + public List getResults() { + return results; + } + + public void setResults(List results) { + this.results = results; + } +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/SparkPrepareResultProject.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/SparkPrepareResultProject.java new file mode 100644 index 0000000000..179c449f68 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/SparkPrepareResultProject.java @@ -0,0 +1,165 @@ + +package eu.dnetlib.dhp.oa.graph.dump.funderresults; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.Serializable; +import java.util.*; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.api.java.function.MapGroupsFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.graph.dump.ResultMapper; +import eu.dnetlib.dhp.oa.graph.dump.Utils; +import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap; +import eu.dnetlib.dhp.schema.dump.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.Relation; +import scala.Tuple2; + +/** + * Preparation of the Project information to be added to the dumped results. For each result associated to at least one + * Project, a serialization of an instance af ResultProject closs is done. ResultProject contains the resultId, and the + * list of Projects (as in eu.dnetlib.dhp.schema.dump.oaf.community.Project) it is associated to + */ +public class SparkPrepareResultProject implements Serializable { + private static final Logger log = LoggerFactory.getLogger(SparkPrepareResultProject.class); + + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + SparkPrepareResultProject.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/dump/project_prep_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + final String communityMapPath = parser.get("communityMapPath"); + log.info("communityMapPath: {}", communityMapPath); + + SparkConf conf = new SparkConf(); + + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + Utils.removeOutputDir(spark, outputPath); + prepareResultProjectList2(spark, inputPath, outputPath, communityMapPath); + }); + } + + private static void prepareResultProjectList(SparkSession spark, String inputPath, String outputPath, + String communityMapPath) { + + CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath); + + Dataset relation = Utils + .readPath(spark, inputPath + "/relation", Relation.class) + .filter("dataInfo.deletedbyinference = false and relClass = 'produces'"); + + Dataset result = Utils + .readPath(spark, inputPath + "/publication", eu.dnetlib.dhp.schema.oaf.Result.class) + .union(Utils.readPath(spark, inputPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Result.class)) + .union(Utils.readPath(spark, inputPath + "/otherresearchproduct", eu.dnetlib.dhp.schema.oaf.Result.class)) + .union(Utils.readPath(spark, inputPath + "/software", eu.dnetlib.dhp.schema.oaf.Result.class)); + + result + .joinWith(relation, result.col("id").equalTo(relation.col("target"))) + .groupByKey( + (MapFunction, String>) value -> value + ._2() + .getSource() + .substring(3, 15), + Encoders.STRING()) + .mapGroups( + (MapGroupsFunction, Tuple2>) ( + s, it) -> { + Tuple2 first = it.next(); + FunderResults fr = new FunderResults(); + List resultList = new ArrayList<>(); + resultList.add(ResultMapper.map(first._1(), communityMap, true)); + it.forEachRemaining(c -> { + resultList.add(ResultMapper.map(c._1(), communityMap, true)); + + }); + fr.setResults(resultList); + return new Tuple2<>(s, fr); + }, Encoders.tuple(Encoders.STRING(), Encoders.bean(FunderResults.class))) + .foreach(t -> { + String funder = t._1(); + spark + .createDataFrame(t._2.getResults(), Result.class) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath + "/" + funder); + }); + + } + + private static void prepareResultProjectList2(SparkSession spark, String inputPath, String outputPath, + String communityMapPath) { + + CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath); + + Dataset relation = Utils + .readPath(spark, inputPath + "/relation", Relation.class) + .filter("dataInfo.deletedbyinference = false and relClass = 'produces'"); + + Dataset result = Utils + .readPath(spark, inputPath + "/publication", eu.dnetlib.dhp.schema.oaf.Result.class) + .union(Utils.readPath(spark, inputPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Result.class)) + .union(Utils.readPath(spark, inputPath + "/otherresearchproduct", eu.dnetlib.dhp.schema.oaf.Result.class)) + .union(Utils.readPath(spark, inputPath + "/software", eu.dnetlib.dhp.schema.oaf.Result.class)); + + result + .joinWith(relation, result.col("id").equalTo(relation.col("target"))) + .groupByKey( + (MapFunction, String>) value -> value + ._2() + .getSource() + .substring(3, 15), + Encoders.STRING()) + .mapGroups( + (MapGroupsFunction, String>) (s, it) -> { + Tuple2 first = it.next(); + List resultList = new ArrayList<>(); + resultList.add(ResultMapper.map(first._1(), communityMap, true)); + it.forEachRemaining(c -> { + resultList.add(ResultMapper.map(c._1(), communityMap, true)); + + }); + spark + .createDataFrame(resultList, Result.class) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath + "/" + s); + + return new String(); + }, Encoders.STRING()); + + } + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/funder_result_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/funder_result_parameters.json new file mode 100644 index 0000000000..f5eb819838 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/funder_result_parameters.json @@ -0,0 +1,26 @@ +[ + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + }, + { + "paramName": "cmp", + "paramLongName": "communityMapPath", + "paramDescription": "the community map path", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/funderresults/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/funderresults/oozie_app/config-default.xml new file mode 100644 index 0000000000..e5ec3d0aee --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/funderresults/oozie_app/config-default.xml @@ -0,0 +1,30 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + hiveMetastoreUris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hiveJdbcUrl + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000 + + + hiveDbName + openaire + + + oozie.launcher.mapreduce.user.classpath.first + true + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/funderresults/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/funderresults/oozie_app/workflow.xml new file mode 100644 index 0000000000..5ddab3398e --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/funderresults/oozie_app/workflow.xml @@ -0,0 +1,179 @@ + + + + + sourcePath + the source path + + + isLookUpUrl + the isLookup service endpoint + + + outputPath + the output path + + + accessToken + the access token used for the deposition in Zenodo + + + connectionUrl + the connection url for Zenodo + + + metadata + the metadata associated to the deposition + + + depositionType + the type of deposition we want to perform. "new" for brand new deposition, "version" for a new version of a published deposition (in this case the concept record id must be provided), "upload" to upload content to an open deposition for which we already have the deposition id (in this case the deposition id should be provided) + + + conceptRecordId + for new version, the id of the record for the old deposition + + + depositionId + the depositionId of a deposition open that has to be added content + + + hiveDbName + the target hive database name + + + hiveJdbcUrl + hive server jdbc url + + + hiveMetastoreUris + hive server metastore URIs + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + eu.dnetlib.dhp.oa.graph.dump.SaveCommunityMap + --outputPath${workingDir}/communityMap + --nameNode${nameNode} + --isLookUpUrl${isLookUpUrl} + + + + + + + + + yarn + cluster + Dump funder results + eu.dnetlib.dhp.oa.graph.dump.funderresults.SparkPrepareResultProject + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${sourcePath} + --outputPath${workingDir}/result + --communityMapPath${workingDir}/communityMap + + + + + + + + eu.dnetlib.dhp.oa.graph.dump.SendToZenodoHDFS + --hdfsPath${outputPath} + --nameNode${nameNode} + --accessToken${accessToken} + --connectionUrl${connectionUrl} + --metadata${metadata} + --communityMapPath${workingDir}/communityMap + --conceptRecordId${conceptRecordId} + --depositionType${depositionType} + --depositionId${depositionId} + + + + + + + + \ No newline at end of file