This commit is contained in:
Miriam Baglioni 2021-12-13 15:01:40 +01:00
parent 4bb1d43afc
commit 8d755cca80
9 changed files with 94 additions and 76 deletions

View File

@ -22,7 +22,7 @@ public class SparkDumpEntitiesJob implements Serializable {
.toString(
SparkDumpEntitiesJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/wf/input_parameters.json"));
"/eu/dnetlib/dhp/oa/graph/dump/input_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);

View File

@ -57,11 +57,10 @@ public class SparkDumpRelationJob implements Serializable {
Optional<String> rs = Optional.ofNullable(parser.get("removeSet"));
final Set<String> removeSet = new HashSet<>();
if(rs.isPresent()){
if (rs.isPresent()) {
Collections.addAll(removeSet, rs.get().split(";"));
}
SparkConf conf = new SparkConf();
runWithSparkSession(
@ -78,7 +77,7 @@ public class SparkDumpRelationJob implements Serializable {
private static void dumpRelation(SparkSession spark, String inputPath, String outputPath, Set<String> removeSet) {
Dataset<Relation> relations = Utils.readPath(spark, inputPath, Relation.class);
relations
.filter((FilterFunction<Relation>)r -> !removeSet.contains(r.getRelClass()))
.filter((FilterFunction<Relation>) r -> !removeSet.contains(r.getRelClass()))
.map((MapFunction<Relation, eu.dnetlib.dhp.schema.dump.oaf.graph.Relation>) relation -> {
eu.dnetlib.dhp.schema.dump.oaf.graph.Relation relNew = new eu.dnetlib.dhp.schema.dump.oaf.graph.Relation();
relNew

View File

@ -955,12 +955,39 @@ public class DumpJobTest {
"lateral view explode (instance) i as inst " +
"where inst.articleprocessingcharge is not null");
Assertions
.assertEquals(
"3131.64",
temp
.filter("id = '50|datacite____::05c611fdfc93d7a2a703d1324e28104a'")
.collectAsList()
.get(0)
.getString(1));
Assertions
.assertEquals(
"EUR",
temp
.filter("id = '50|datacite____::05c611fdfc93d7a2a703d1324e28104a'")
.collectAsList()
.get(0)
.getString(2));
Assertions.assertEquals("3131.64", temp.filter("id = '50|datacite____::05c611fdfc93d7a2a703d1324e28104a'").collectAsList().get(0).getString(1));
Assertions.assertEquals("EUR", temp.filter("id = '50|datacite____::05c611fdfc93d7a2a703d1324e28104a'").collectAsList().get(0).getString(2));
Assertions.assertEquals("2578.35", temp.filter("id = '50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8'").collectAsList().get(0).getString(1));
Assertions.assertEquals("EUR", temp.filter("id = '50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8'").collectAsList().get(0).getString(2));
Assertions
.assertEquals(
"2578.35",
temp
.filter("id = '50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8'")
.collectAsList()
.get(0)
.getString(1));
Assertions
.assertEquals(
"EUR",
temp
.filter("id = '50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8'")
.collectAsList()
.get(0)
.getString(2));
}
}

View File

@ -4,10 +4,8 @@ package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
@ -83,7 +81,6 @@ public class DumpRelationTest {
"-sourcePath", sourcePath
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
@ -145,7 +142,6 @@ public class DumpRelationTest {
"-sourcePath", sourcePath
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
@ -217,7 +213,6 @@ public class DumpRelationTest {
"-removeSet", "isParticipant"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
@ -249,7 +244,6 @@ public class DumpRelationTest {
Assertions.assertEquals(0, check.filter("name = 'isParticipant'").count());
Assertions.assertEquals(1, check.filter("name = 'isAuthorInstitutionOf'").count());
Assertions
.assertEquals(
@ -273,7 +267,6 @@ public class DumpRelationTest {
"-removeSet", "isParticipant;isAuthorInstitutionOf"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
@ -305,7 +298,6 @@ public class DumpRelationTest {
Assertions.assertEquals(0, check.filter("name = 'isParticipant'").count());
Assertions.assertEquals(0, check.filter("name = 'isAuthorInstitutionOf'").count());
}