dump of the results related to at least one project #61

Merged
claudio.atzori merged 51 commits from miriam.baglioni/dnet-hadoop:dump into master 2020-12-09 17:22:57 +01:00
3 changed files with 6 additions and 13 deletions
Showing only changes of commit 0a9db67eec - Show all commits

View File

@@ -8,7 +8,10 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
@@ -18,6 +21,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import scala.Tuple2;
public class SparkResultLinkedToProject implements Serializable {
@@ -74,6 +78,7 @@ public class SparkResultLinkedToProject implements Serializable {
.joinWith(
results, relations.col("target").equalTo(results.col("id")),
"inner")
.map((MapFunction<Tuple2<Relation, R>, R>) t2 -> t2._2(), Encoders.bean(inputClazz))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")

View File

@@ -17,12 +17,7 @@
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
},
{
"paramName": "cmp",
"paramLongName": "communityMapPath",
"paramDescription": "the community map path",
"paramRequired": true
},{
{
"paramName": "rp",
"paramLongName": "relationPath",
"paramDescription": "the relationPath",

View File

@@ -1,11 +1,4 @@
[
{
"paramName":"cmp",
"paramLongName":"communityMapPath",
"paramDescription": "the path to the serialization of the community map",
"paramRequired": true
},
{
"paramName":"s",
"paramLongName":"sourcePath",