forked from D-Net/dnet-hadoop
changed the method signature from a set of map-key entries to a String representing the path on the file system where the map can be found
This commit is contained in:
parent
931fa2ff00
commit
ae1b7fbfdb
|
@ -1,10 +1,17 @@
|
|||
|
||||
/**
|
||||
* This class splits the dumped results according to the research community - research initiative/infrastructure
|
||||
* they are related to. The information about the community is found in the element "context.id" in the result.
|
||||
* Since the context found in the result may be associated not only with communities, a community Map
|
||||
* is provided. It will guide the splitting process.
|
||||
* Note the use of repartition(1) just before writing the results related to a community. This is a choice due
|
||||
* to uploading constraints (just one file for each community). As soon as a better solution is in place,
|
||||
* remove the repartition
|
||||
*/
|
||||
package eu.dnetlib.dhp.oa.graph.dump.community;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
@ -19,19 +26,19 @@ import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
|
|||
|
||||
public class CommunitySplit implements Serializable {
|
||||
|
||||
public void run(Boolean isSparkSessionManaged, String inputPath, String outputPath, CommunityMap communityMap) {
|
||||
public void run(Boolean isSparkSessionManaged, String inputPath, String outputPath, String communityMapPath) {
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
execSplit(spark, inputPath, outputPath, communityMap.keySet());// , inputClazz);
|
||||
execSplit(spark, inputPath, outputPath, Utils.getCommunityMap(spark, communityMapPath).keySet());
|
||||
});
|
||||
}
|
||||
|
||||
private static void execSplit(SparkSession spark, String inputPath, String outputPath,
|
||||
Set<String> communities) {// }, Class<R> inputClazz) {
|
||||
Set<String> communities) {
|
||||
|
||||
Dataset<CommunityResult> result = Utils
|
||||
.readPath(spark, inputPath + "/publication", CommunityResult.class)
|
||||
|
|
Loading…
Reference in New Issue