[DUMP DELTA PROJECTS] refactoring
commit 1d1fe3b151 (parent edddfc6c63)
Refactors ProjectSubsetTest: drops the unused imports eu.dnetlib.dhp.oa.graph.dump.funderresults.SparkResultLinkedToProject and eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult, together with the unused field private static final HashMap<String, String> map. The test class after the change is reproduced below; the Spark and JUnit imports that fall between the two diff hunks are not shown in the diff and are restored here from usage.

package eu.dnetlib.dhp.oa.graph.dump.projectssubset;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;

import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.schema.dump.oaf.graph.Project;

public class ProjectSubsetTest {

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    private static SparkSession spark;

    private static Path workingDir;

    private static final Logger log = LoggerFactory
        .getLogger(eu.dnetlib.dhp.oa.graph.dump.projectssubset.ProjectSubsetTest.class);

    @BeforeAll
    public static void beforeAll() throws IOException {
        workingDir = Files
            .createTempDirectory(
                eu.dnetlib.dhp.oa.graph.dump.projectssubset.ProjectSubsetTest.class.getSimpleName());
        log.info("using work dir {}", workingDir);
        SparkConf conf = new SparkConf();
        conf.setAppName(eu.dnetlib.dhp.oa.graph.dump.projectssubset.ProjectSubsetTest.class.getSimpleName());
        conf.setMaster("local[*]");
        conf.set("spark.driver.host", "localhost");
        conf.set("hive.metastore.local", "true");
        conf.set("spark.ui.enabled", "false");
        conf.set("spark.sql.warehouse.dir", workingDir.toString());
        conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
        spark = SparkSession
            .builder()
            .appName(eu.dnetlib.dhp.oa.graph.dump.projectssubset.ProjectSubsetTest.class.getSimpleName())
            .config(conf)
            .getOrCreate();
    }

    @AfterAll
    public static void afterAll() throws IOException {
        FileUtils.deleteDirectory(workingDir.toFile());
        spark.stop();
    }

    @Test
    void testAllNew() throws Exception {
        final String projectListPath = getClass()
            .getResource("/eu/dnetlib/dhp/oa/graph/dump/projectsubset/projectId")
            .getPath();
        final String sourcePath = getClass()
            .getResource("/eu/dnetlib/dhp/oa/graph/dump/projectsubset/allnew/projects")
            .getPath();
        spark
            .read()
            .textFile(projectListPath)
            .write()
            .mode(SaveMode.Overwrite)
            .text(workingDir.toString() + "/projectIds");
        ProjectsSubsetSparkJob.main(new String[] {
            "-isSparkSessionManaged", Boolean.FALSE.toString(),
            "-outputPath", workingDir.toString() + "/projects",
            "-sourcePath", sourcePath,
            "-projectListPath", workingDir.toString() + "/projectIds"
        });
        final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
        JavaRDD<Project> tmp = sc
            .textFile(workingDir.toString() + "/projects")
            .map(item -> OBJECT_MAPPER.readValue(item, Project.class));
        Assertions.assertEquals(12, tmp.count());
        Assertions.assertEquals(2, tmp.filter(p -> p.getId().substring(3, 15).equals("aka_________")).count());
        Assertions.assertEquals(2, tmp.filter(p -> p.getId().substring(3, 15).equals("anr_________")).count());
        Assertions.assertEquals(4, tmp.filter(p -> p.getId().substring(3, 15).equals("arc_________")).count());
        Assertions.assertEquals(3, tmp.filter(p -> p.getId().substring(3, 15).equals("conicytf____")).count());
        Assertions.assertEquals(1, tmp.filter(p -> p.getId().substring(3, 15).equals("corda_______")).count());
        Assertions.assertEquals(40, sc.textFile(workingDir.toString() + "/projectIds").count());
    }

    @Test
    void testMatchOne() throws Exception {
        final String projectListPath = getClass()
            .getResource("/eu/dnetlib/dhp/oa/graph/dump/projectsubset/projectId")
            .getPath();
        final String sourcePath = getClass()
            .getResource("/eu/dnetlib/dhp/oa/graph/dump/projectsubset/matchOne/projects")
            .getPath();
        spark
            .read()
            .textFile(projectListPath)
            .write()
            .mode(SaveMode.Overwrite)
            .text(workingDir.toString() + "/projectIds");
        ProjectsSubsetSparkJob.main(new String[] {
            "-isSparkSessionManaged", Boolean.FALSE.toString(),
            "-outputPath", workingDir.toString() + "/projects",
            "-sourcePath", sourcePath,
            "-projectListPath", workingDir.toString() + "/projectIds"
        });
        final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
        JavaRDD<Project> tmp = sc
            .textFile(workingDir.toString() + "/projects")
            .map(item -> OBJECT_MAPPER.readValue(item, Project.class));
        Assertions.assertEquals(11, tmp.count());
        Assertions.assertEquals(2, tmp.filter(p -> p.getId().substring(3, 15).equals("aka_________")).count());
        Assertions.assertEquals(2, tmp.filter(p -> p.getId().substring(3, 15).equals("anr_________")).count());
        Assertions.assertEquals(4, tmp.filter(p -> p.getId().substring(3, 15).equals("arc_________")).count());
        Assertions.assertEquals(3, tmp.filter(p -> p.getId().substring(3, 15).equals("conicytf____")).count());
        Assertions.assertEquals(0, tmp.filter(p -> p.getId().substring(3, 15).equals("corda__h2020")).count());
        Assertions.assertEquals(39, sc.textFile(workingDir.toString() + "/projectIds").count());
    }
}
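For context, a minimal sketch of what ProjectsSubsetSparkJob appears to do, inferred solely from the assertions above (both tests start from the same 28-line id list; testAllNew dumps all 12 source projects and the list grows to 40, while testMatchOne dumps 11 of 12 because one id is already listed and the list grows to 39): keep only the projects whose id is not yet in the list, then append the newly dumped ids. The class name ProjectsSubsetSketch and its run method are hypothetical; this is not the actual job source.

package eu.dnetlib.dhp.oa.graph.dump.projectssubset;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

// Hypothetical sketch, not the real ProjectsSubsetSparkJob implementation.
public class ProjectsSubsetSketch {

    public static void run(SparkSession spark, String sourcePath,
        String projectListPath, String outputPath) {
        // ids dumped by previous runs, one id per text line
        Dataset<Row> knownIds = spark.read().textFile(projectListPath).toDF("id");
        // projects are stored as one JSON object per line
        Dataset<Row> projects = spark.read().json(sourcePath);
        // left_anti join keeps only projects whose id is NOT in the list
        Dataset<Row> newProjects = projects
            .join(knownIds, projects.col("id").equalTo(knownIds.col("id")), "left_anti");
        // dump the subset as JSON text lines
        newProjects.toJSON().write().mode(SaveMode.Overwrite).text(outputPath);
        // append the freshly dumped ids so a later run skips them;
        // appending to the same path the list was read from is an assumption
        // that matches the 40/39 counts in the tests, not a confirmed detail
        newProjects
            .select("id")
            .as(Encoders.STRING())
            .write()
            .mode(SaveMode.Append)
            .text(projectListPath);
    }
}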
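Read this way, the delta dump is incremental by construction: each run emits only projects unseen by earlier runs, and the id list is the only state carried between runs, which is why both tests check the list's line count as well as the dumped projects. The left_anti join in the sketch is just one way to express the "not already dumped" filter; an except over the id column or a broadcast lookup would behave equivalently.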