package eu.dnetlib.dhp.sx.graph; import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; import com.jayway.jsonpath.JsonPath; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import net.minidev.json.JSONArray; /** * This Job extracts a typology of entity and stores it in a new RDD This job is called different times, for each file * generated by the Job {@link ImportDataFromMongo} and store the new RDD in a path that should be under a folder: * extractedEntities/entity/version1 *
* At the end of this process the output is organised as follows:
* extractedEntities/dataset/version1, extractedEntities/dataset/version2, extractedEntities/dataset/...
* extractedEntities/publication/version1, extractedEntities/publication/version2, extractedEntities/publication/...
* extractedEntities/unknown/version1, extractedEntities/unknown/version2, extractedEntities/unknown/...
* extractedEntities/relation/version1, extractedEntities/relation/version2, extractedEntities/relation/...
*/
public class SparkExtractEntitiesJob {
/**
 * JsonPath expression that selects the top-level {@code id} field of a JSON document.
 * NOTE(review): presumably evaluated against each input record; the usage is outside this view — confirm.
 */
static final String IDJSONPATH = "$.id";
/**
 * JsonPath expression that selects the top-level {@code source} field of a JSON document.
 * NOTE(review): presumably the source endpoint of a relation record — confirm against the caller.
 */
static final String SOURCEJSONPATH = "$.source";
/**
 * JsonPath expression that selects the top-level {@code target} field of a JSON document.
 * NOTE(review): presumably the target endpoint of a relation record — confirm against the caller.
 */
static final String TARGETJSONPATH = "$.target";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkExtractEntitiesJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.appName(SparkExtractEntitiesJob.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final String inputPath = parser.get("sourcePath");
final String targetPath = parser.get("targetPath");
final String tdir = parser.get("targetDir");
final JavaRDD