[Dump Funders] -
This commit is contained in:
parent
13d1d73b2e
commit
9ba598a9b5
|
@ -84,6 +84,15 @@ public class SparkDumpFunderResults implements Serializable {
|
|||
, Encoders.STRING())
|
||||
.distinct();
|
||||
|
||||
Dataset<CommunityResult> pubs;
|
||||
Dataset<CommunityResult> result ;
|
||||
pubs = Utils
|
||||
.readPath(spark, inputPath + "/publication", CommunityResult.class);
|
||||
Dataset<CommunityResult> dats = Utils.readPath(spark, inputPath + "/dataset", CommunityResult.class);
|
||||
Dataset<CommunityResult> orp = Utils.readPath(spark, inputPath + "/otherresearchproduct", CommunityResult.class);
|
||||
Dataset<CommunityResult> sw = Utils.readPath(spark, inputPath + "/software", CommunityResult.class);
|
||||
result = pubs.union(dats).union(orp).union(sw);
|
||||
|
||||
funderList.foreach((ForeachFunction<String>) funder ->
|
||||
getFunderResult(funder, inputPath, spark)
|
||||
.write()
|
||||
|
@ -99,12 +108,15 @@ public class SparkDumpFunderResults implements Serializable {
|
|||
|
||||
@Nullable
|
||||
private static Dataset<CommunityResult> getFunderResult(String funderName, String inputPath, SparkSession spark) {
|
||||
Dataset<CommunityResult> result = Utils
|
||||
.readPath(spark, inputPath + "/publication", CommunityResult.class)
|
||||
.union(Utils.readPath(spark, inputPath + "/dataset", CommunityResult.class))
|
||||
.union(Utils.readPath(spark, inputPath + "/otherresearchproduct", CommunityResult.class))
|
||||
.union(Utils.readPath(spark, inputPath + "/software", CommunityResult.class));
|
||||
return result.map((MapFunction<CommunityResult, CommunityResult>) cr -> {
|
||||
Dataset<CommunityResult> pubs;
|
||||
Dataset<CommunityResult> result ;
|
||||
pubs = Utils
|
||||
.readPath(spark, inputPath + "/publication", CommunityResult.class);
|
||||
Dataset<CommunityResult> dats = Utils.readPath(spark, inputPath + "/dataset", CommunityResult.class);
|
||||
Dataset<CommunityResult> orp = Utils.readPath(spark, inputPath + "/otherresearchproduct", CommunityResult.class);
|
||||
Dataset<CommunityResult> sw = Utils.readPath(spark, inputPath + "/software", CommunityResult.class);
|
||||
result = pubs.union(dats).union(orp).union(sw);
|
||||
Dataset<CommunityResult> tmp = result.map((MapFunction<CommunityResult, CommunityResult>) cr -> {
|
||||
if (!Optional.ofNullable(cr.getProjects()).isPresent()) {
|
||||
return null;
|
||||
}
|
||||
|
@ -116,6 +128,8 @@ public class SparkDumpFunderResults implements Serializable {
|
|||
return null;
|
||||
}, Encoders.bean(CommunityResult.class))
|
||||
.filter(Objects::nonNull);
|
||||
System.out.println(tmp.count());
|
||||
return tmp;
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -81,15 +81,15 @@ public class SplitPerFunderTest {
|
|||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
// FP7 3
|
||||
// FP7 3 and H2020 3
|
||||
JavaRDD<CommunityResult> tmp = sc
|
||||
.textFile(workingDir.toString() + "/split/EC_FP7")
|
||||
.textFile(workingDir.toString() + "/split/EC")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, CommunityResult.class));
|
||||
|
||||
org.apache.spark.sql.Dataset<CommunityResult> verificationDataset = spark
|
||||
.createDataset(tmp.rdd(), Encoders.bean(CommunityResult.class));
|
||||
|
||||
Assertions.assertEquals(3, verificationDataset.count());
|
||||
Assertions.assertEquals(6, verificationDataset.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
|
@ -132,10 +132,10 @@ public class SplitPerFunderTest {
|
|||
Assertions.assertEquals(1, tmp.count());
|
||||
|
||||
// H2020 3
|
||||
tmp = sc
|
||||
.textFile(workingDir.toString() + "/split/EC_H2020")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, CommunityResult.class));
|
||||
Assertions.assertEquals(3, tmp.count());
|
||||
// tmp = sc
|
||||
// .textFile(workingDir.toString() + "/split/EC_H2020")
|
||||
// .map(item -> OBJECT_MAPPER.readValue(item, CommunityResult.class));
|
||||
// Assertions.assertEquals(3, tmp.count());
|
||||
|
||||
// MZOS 1
|
||||
tmp = sc
|
||||
|
|
Loading…
Reference in New Issue