Changed the way the files with information on resources or relations are read: from sequenceFile to textFile

2020-03-17 16:32:05 +01:00 · 2020-03-17 16:32:05 +01:00 · 67ea3cf3ed
parent b4652d018c
commit 67ea3cf3ed
1 changed files with 21 additions and 45 deletions
--- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java
+++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java
@ -6,6 +6,8 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.Dataset;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Text;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
@ -14,6 +16,7 @@ import org.apache.spark.sql.*;
 import scala.Tuple2;
 import java.io.File;
 import java.io.IOException;
 import java.util.*;
 import static eu.dnetlib.dhp.PropagationConstant.*;
@ -34,45 +37,43 @@ public class SparkCountryPropagationJob {
        final String inputPath = parser.get("sourcePath");
        final String outputPath = "/tmp/provision/propagation/countrytoresultfrominstitutionalrepositories";
-        File directory = new File(outputPath);
+        createOutputDirs(outputPath, FileSystem.get(spark.sparkContext().hadoopConfiguration()));
        if(!directory.exists()){
            directory.mkdirs();
        }
        List<String> whitelist = Arrays.asList(parser.get("whitelist").split(";"));
        List<String> allowedtypes = Arrays.asList(parser.get("allowedtypes").split(";"));
-        JavaPairRDD<String, TypedRow> organizations = sc.sequenceFile(inputPath + "/organization", Text.class, Text.class)
+        JavaPairRDD<String, TypedRow> organizations = sc.textFile(inputPath + "/organization")
-                .map(item -> new ObjectMapper().readValue(item._2().toString(), Organization.class))
+                .map(item -> new ObjectMapper().readValue(item, Organization.class))
                .filter(org -> !org.getDataInfo().getDeletedbyinference())
                .map(org -> new TypedRow().setSourceId(org.getId()).setValue(org.getCountry().getClassid()))
                .mapToPair(toPair());
-        JavaPairRDD<String, TypedRow> organization_datasource = sc.sequenceFile(inputPath + "/relation", Text.class, Text.class)
+        JavaPairRDD<String, TypedRow> organization_datasource =
-                .map(item -> new ObjectMapper().readValue(item._2().toString(), Relation.class))
+                sc.textFile(inputPath + "/relation")
                .map(item -> new ObjectMapper().readValue(item, Relation.class))
                .filter(r -> !r.getDataInfo().getDeletedbyinference())
                .filter(r -> RELATION_DATASOURCEORGANIZATION_REL_TYPE.equals(r.getRelClass()) && RELATION_ORGANIZATION_DATASOURCE_REL_CLASS.equals(r.getRelType()))
                .map(r -> new TypedRow().setSourceId(r.getSource()).setTargetId(r.getTarget()))
                .mapToPair(toPair()); //id is the organization identifier
-        JavaPairRDD<String, TypedRow> datasources = sc.sequenceFile(inputPath + "/datasource", Text.class, Text.class)
+        JavaPairRDD<String, TypedRow> datasources = sc.textFile(inputPath + "/datasource")
-                .map(item -> new ObjectMapper().readValue(item._2().toString(), Datasource.class))
+                .map(item -> new ObjectMapper().readValue(item, Datasource.class))
                .filter(ds -> whitelist.contains(ds.getId()) || allowedtypes.contains(ds.getDatasourcetype().getClassid()))
                .map(ds -> new TypedRow().setSourceId(ds.getId()))
                .mapToPair(toPair());
-        JavaRDD<Publication> publications = sc.sequenceFile(inputPath + "/publication", Text.class, Text.class)
+        JavaRDD<Publication> publications = sc.textFile(inputPath + "/publication")
-                .map(item -> new ObjectMapper().readValue(item._2().toString(), Publication.class));
+                .map(item -> new ObjectMapper().readValue(item, Publication.class));
-        JavaRDD<Dataset> datasets = sc.sequenceFile(inputPath + "/dataset", Text.class, Text.class)
+        JavaRDD<Dataset> datasets = sc.textFile(inputPath + "/dataset")
-                .map(item -> new ObjectMapper().readValue(item._2().toString(), Dataset.class));
+                .map(item -> new ObjectMapper().readValue(item, Dataset.class));
-        JavaRDD<Software> software = sc.sequenceFile(inputPath + "/software", Text.class, Text.class)
+        JavaRDD<Software> software = sc.textFile(inputPath + "/software")
-                .map(item -> new ObjectMapper().readValue(item._2().toString(), Software.class));
+                .map(item -> new ObjectMapper().readValue(item, Software.class));
-        JavaRDD<OtherResearchProduct> other = sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class)
+        JavaRDD<OtherResearchProduct> other = sc.textFile(inputPath + "/otherresearchproduct")
-                .map(item -> new ObjectMapper().readValue(item._2().toString(), OtherResearchProduct.class));
+                .map(item -> new ObjectMapper().readValue(item, OtherResearchProduct.class));
        JavaPairRDD<String, TypedRow> datasource_results = publications
                .map(oaf -> getTypedRowsDatasourceResult(oaf))
@ -147,6 +148,8 @@ public class SparkCountryPropagationJob {
    }
    private static void updateResult(JavaPairRDD<String, Result> results, JavaPairRDD<String, TypedRow> toupdateresult, String outputPath, String type) {
        results.leftOuterJoin(toupdateresult)
                .map(c -> {
@ -177,33 +180,6 @@ public class SparkCountryPropagationJob {
    /**
     * Builds a single pair RDD describing every result found under {@code inputPath}.
     *
     * <p>Four sub-directories are read as sequence files with {@code Text} keys and values:
     * {@code /dataset}, {@code /otherresearchproduct}, {@code /software} and {@code /publication}.
     * For each record the value part ({@code item._2()}) is deserialized with {@code ObjectMapper}
     * into the matching class, records whose {@code dataInfo.deletedbyinference} flag is set are
     * dropped, and the remaining ones are turned into a {@code TypedRow} carrying the type label
     * and the entity id as source id. The four RDDs are then merged with {@code union}.
     *
     * <p>NOTE(review): the other reads visible in this change use {@code sc.textFile}, while this
     * method still uses {@code sc.sequenceFile} - confirm whether it should be migrated as well
     * or is meant to be removed.
     *
     * @param sc        Spark context used to read the input
     * @param inputPath base path containing one sub-directory per result type
     * @return the union of the typed rows of all four result types; the key is whatever
     *         {@code toPair()} (defined elsewhere) produces - presumably the source id, TODO confirm
     */
    private static JavaPairRDD<String, TypedRow> getResults(JavaSparkContext sc , String inputPath){
        return
                sc.sequenceFile(inputPath + "/dataset", Text.class, Text.class)
                    .map(item -> new ObjectMapper().readValue(item._2().toString(), Dataset.class))
                        .filter(ds -> !ds.getDataInfo().getDeletedbyinference())
                    .map(oaf -> new TypedRow().setType("dataset").setSourceId(oaf.getId()))
                    .mapToPair(toPair())
                .union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class)
                        .map(item -> new ObjectMapper().readValue(item._2().toString(), OtherResearchProduct.class))
                        .filter(o -> !o.getDataInfo().getDeletedbyinference())
                        .map(oaf -> new TypedRow().setType("otherresearchproduct").setSourceId(oaf.getId()))
                        .mapToPair(toPair()))
                .union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class)
                        .map(item -> new ObjectMapper().readValue(item._2().toString(), Software.class))
                        .filter(s -> !s.getDataInfo().getDeletedbyinference())
                        .map(oaf -> new TypedRow().setType("software").setSourceId(oaf.getId()))
                        .mapToPair(toPair()))
                .union(sc.sequenceFile(inputPath + "/publication", Text.class, Text.class)
                                .map(item -> new ObjectMapper().readValue(item._2().toString(), Publication.class))
                        .filter(p -> !p.getDataInfo().getDeletedbyinference())
                                .map(oaf -> new TypedRow().setType("publication").setSourceId(oaf.getId()))
                                .mapToPair(toPair()));
    }