diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkPropagateRelationsJob.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkPropagateRelationsJob.java similarity index 93% rename from dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkPropagateRelationsJob.java rename to dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkPropagateRelationsJob.java index 9f48ce521..2896a2aa1 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkPropagateRelationsJob.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkPropagateRelationsJob.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dedup.sx; import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; @@ -7,14 +7,8 @@ import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.utils.DHPUtils; import org.apache.commons.io.IOUtils; -import org.apache.hadoop.io.compress.GzipCodec; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.Optional; -import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.*; import scala.Tuple2; diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkUpdateEntityJob.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityJob.java similarity index 98% rename from dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkUpdateEntityJob.java rename to dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityJob.java index 396349481..6039e5526 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkUpdateEntityJob.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityJob.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dedup.sx; import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; @@ -15,7 +15,6 @@ import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.*; import scala.Tuple2; diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml index ddbf39e5f..46f334b1b 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml @@ -122,7 +122,7 @@ yarn-cluster cluster Propagate Dedup Relations - eu.dnetlib.dedup.SparkPropagateRelationsJob + eu.dnetlib.dedup.sx.SparkPropagateRelationsJob dhp-dedup-${projectVersion}.jar --executor-memory ${sparkExecutorMemory} @@ -146,7 +146,7 @@ yarn-cluster cluster Update ${entity} and add DedupRecord - eu.dnetlib.dedup.SparkUpdateEntityJob + eu.dnetlib.dedup.sx.SparkUpdateEntityJob dhp-dedup-${projectVersion}.jar --executor-memory ${sparkExecutorMemory} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/conf/pub_scholix.conf.json b/dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/sx/conf/pub_scholix.conf.json similarity index 100% rename from dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/conf/pub_scholix.conf.json rename to dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/sx/conf/pub_scholix.conf.json diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/TargetFunction.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/TargetFunction.java deleted file mode 100644 index 31a554a63..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/TargetFunction.java +++ /dev/null @@ -1,15 +0,0 @@ -package eu.dnetlib.dhp.graph.scholexplorer; - - -import eu.dnetlib.dhp.schema.oaf.Relation; -import org.apache.commons.lang3.StringUtils; -import org.apache.spark.api.java.function.MapFunction; - -public class TargetFunction implements MapFunction { - @Override - public Relation call(Relation relation) throws Exception { - final String type = StringUtils.substringBefore(relation.getSource(), "|"); - relation.setTarget(String.format("%s|%s", type, StringUtils.substringAfter(relation.getTarget(),"::"))); - return relation; - } -} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/ImportDataFromMongo.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/ImportDataFromMongo.java similarity index 72% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/ImportDataFromMongo.java rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/ImportDataFromMongo.java index 2357c3787..8994e9667 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/ImportDataFromMongo.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/ImportDataFromMongo.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dhp.graph.scholexplorer; +package eu.dnetlib.dhp.graph.sx; import com.mongodb.DBObject; import com.mongodb.MongoClient; @@ -16,7 +16,6 @@ import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.bson.Document; import org.bson.conversions.Bson; - import java.io.IOException; import java.net.URI; import java.util.ArrayList; @@ -26,14 +25,52 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Consumer; import java.util.stream.Collectors; +/** + * This job is responsible to collect + * data from mongoDatabase and store in a sequence File on HDFS + * Mongo database contains information of each MDSTore in two collections: + * -metadata + * That contains info like: + * ID, format, layout, interpretation + * -metadataManager: + * that contains info : + * ID, mongoCollectionName + * from the metadata collection we filter the ids with Format, layout, and Interpretation + * from the metadataManager we get the current MONGO collection name which contains metadata XML + * see function getCurrentId + * + * This Job will be called different times in base at the triple we want import, + * and generates for each triple a sequence file of XML + * + */ + public class ImportDataFromMongo { + /** + * It requires in input some parameters described on a file eu/dnetlib/dhp/graph/sx/import_from_mongo_parameters.json + * + * - the name node + * - the paht where store HDFS File + * - the mongo host + * - the mongo port + * - the metadata format to import + * - the metadata layout to import + * - the metadata interpretation to import + * - the mongo database Name + * + * This params are encoded into args + * + * - + * + * + * @param args + * @throws Exception + */ public static void main(String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils.toString( ImportDataFromMongo.class.getResourceAsStream( - "/eu/dnetlib/dhp/graph/import_from_mongo_parameters.json"))); + "/eu/dnetlib/dhp/sx/graph/argumentparser/import_from_mongo_parameters.json"))); parser.parseArgument(args); final int port = Integer.parseInt(parser.get("dbport")); final String host = parser.get("dbhost"); @@ -43,10 +80,7 @@ public class ImportDataFromMongo { final String interpretation = parser.get("interpretation"); final String dbName = parser.get("dbName"); - - final MongoClient client = new MongoClient(host, port); - MongoDatabase database = client.getDatabase(dbName); MongoCollection metadata = database.getCollection("metadata"); @@ -55,6 +89,8 @@ public class ImportDataFromMongo { final List ids = new ArrayList<>(); metadata.find((Bson) query).forEach((Consumer) document -> ids.add(document.getString("mdId"))); List databaseId = ids.stream().map(it -> getCurrentId(it, metadataManager)).filter(Objects::nonNull).collect(Collectors.toList()); + + final String hdfsuri = parser.get("namenode"); // ====== Init HDFS File System Object Configuration conf = new Configuration(); @@ -64,8 +100,6 @@ public class ImportDataFromMongo { conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - System.setProperty("HADOOP_USER_NAME", parser.get("user")); - System.setProperty("hadoop.home.dir", "/"); FileSystem.get(URI.create(hdfsuri), conf); Path hdfswritepath = new Path(parser.get("targetPath")); @@ -92,13 +126,17 @@ public class ImportDataFromMongo { throw new RuntimeException(e); } } - ); }); } } - + /** + * Return the name of mongo collection giving an MdStore ID + * @param mdId The id of the MDStore + * @param metadataManager The collection metadataManager on mongo which contains this information + * @return + */ private static String getCurrentId(final String mdId, final MongoCollection metadataManager) { FindIterable result = metadataManager.find((Bson) QueryBuilder.start("mdId").is(mdId).get()); final Document item = result.first(); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/SparkExtractEntitiesJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/SparkExtractEntitiesJob.java similarity index 79% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/SparkExtractEntitiesJob.java rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/SparkExtractEntitiesJob.java index cabca4e5c..9f5a91d3c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/SparkExtractEntitiesJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/SparkExtractEntitiesJob.java @@ -1,8 +1,7 @@ -package eu.dnetlib.dhp.graph.scholexplorer; +package eu.dnetlib.dhp.graph.sx; import com.jayway.jsonpath.JsonPath; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.oa.graph.SparkGraphImporterJob; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.compress.GzipCodec; @@ -16,6 +15,27 @@ import java.util.List; import java.util.stream.Collectors; +/** + * This Job extracts a typology of entity and stores it in a new RDD + * This job is called different times, for each file generated by the Job {@link ImportDataFromMongo} + * and store the new RDD in a path that should be under a folder: + * extractedEntities/entity/version1 + * + * at the end of this process we will have : + * extractedEntities/dataset/version1 + * extractedEntities/dataset/version2 + * extractedEntities/dataset/... + * extractedEntities/publication/version1 + * extractedEntities/publication/version2 + * extractedEntities/publication/... + * extractedEntities/unknown/version1 + * extractedEntities/unknown/version2 + * extractedEntities/unknown/... + * extractedEntities/relation/version1 + * extractedEntities/relation/version2 + * extractedEntities/relation/... + */ + public class SparkExtractEntitiesJob { final static String IDJSONPATH = "$.id"; final static String SOURCEJSONPATH = "$.source"; @@ -27,7 +47,7 @@ public class SparkExtractEntitiesJob { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils.toString( SparkExtractEntitiesJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/graph/input_extract_entities_parameters.json"))); + "/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json"))); parser.parseArgument(args); final SparkSession spark = SparkSession .builder() diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerGenerateSimRel.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/SparkSXGeneratePidSimlarity.java similarity index 81% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerGenerateSimRel.java rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/SparkSXGeneratePidSimlarity.java index aea763b85..c3e55ca2f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerGenerateSimRel.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/SparkSXGeneratePidSimlarity.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dhp.graph.scholexplorer; +package eu.dnetlib.dhp.graph.sx; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.utils.DHPUtils; @@ -12,13 +12,24 @@ import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import scala.Tuple2; -public class SparkScholexplorerGenerateSimRel { + +/** + * In some case the identifier generated for the Entity in @{@link SparkExtractEntitiesJob} is different from the identifier + * * associated by the aggregator, this means that some relation points to missing identifier + * To avoid this problem we store in the model the Id and the OriginalObJIdentifier + * This jobs extract this pair and creates a Similar relation that will be used in SparkMergeEntities + * + */ + +public class SparkSXGeneratePidSimlarity { final static String IDJSONPATH = "$.id"; final static String OBJIDPATH = "$.originalObjIdentifier"; + + public static void generateDataFrame(final SparkSession spark, final JavaSparkContext sc, final String inputPath, final String targetPath) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerMergeEntitiesJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/SparkScholexplorerCreateRawGraphJob.java similarity index 81% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerMergeEntitiesJob.java rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/SparkScholexplorerCreateRawGraphJob.java index 41ed137d6..5d8a35c1b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerMergeEntitiesJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/SparkScholexplorerCreateRawGraphJob.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dhp.graph.scholexplorer; +package eu.dnetlib.dhp.graph.sx; import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; @@ -11,6 +11,7 @@ import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown; import eu.dnetlib.dhp.utils.DHPUtils; import net.minidev.json.JSONArray; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -32,7 +33,27 @@ import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; -public class SparkScholexplorerMergeEntitiesJob { + +/** + * This job is responsible of the creation of RAW Graph + * It is applied to the different entities generated from {@link SparkExtractEntitiesJob} + * In case of dataset, publication and Unknown Entities + * we group all the entities of the same type by their identifier, + * and then in the reduce phase we merge all the entities. + * Merge means: + * -merge all the metadata + * -merge the collected From values + * + * In case of relation we need to make a different work: + * -Phase 1: Map reduce jobs + * Map: Get all Relation and emit a key constructed by (source, relType, Target) and the relation itself + * Reduce: Merge all relations + * Looking at the javadoc of {@link SparkSXGeneratePidSimlarity} we take the dataset of pid relation + * and joining by source and target we replace the wrong identifier in the relation with the correct ones. + * At the end we replace the new Dataset of Relation + */ + +public class SparkScholexplorerCreateRawGraphJob { final static String IDJSONPATH = "$.id"; final static String SOURCEJSONPATH = "$.source"; @@ -44,22 +65,20 @@ public class SparkScholexplorerMergeEntitiesJob { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils.toString( - SparkScholexplorerMergeEntitiesJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/graph/merge_entities_scholix_parameters.json"))); + SparkScholexplorerCreateRawGraphJob.class.getResourceAsStream( + "/eu/dnetlib/dhp/sx/graph/argumentparser/merge_entities_scholix_parameters.json"))); parser.parseArgument(args); final SparkSession spark = SparkSession .builder() .config(new SparkConf() .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")) - .appName(SparkScholexplorerMergeEntitiesJob.class.getSimpleName()) + .appName(SparkScholexplorerCreateRawGraphJob.class.getSimpleName()) .master(parser.get("master")) .getOrCreate(); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final String inputPath = parser.get("sourcePath"); final String targetPath = parser.get("targetPath"); final String entity = parser.get("entity"); - - FileSystem fs = FileSystem.get(sc.sc().hadoopConfiguration()); List subFolder = Arrays.stream(fs.listStatus(new Path(inputPath))).filter(FileStatus::isDirectory).map(FileStatus::getPath).collect(Collectors.toList()); List> inputRdd = new ArrayList<>(); @@ -113,7 +132,9 @@ public class SparkScholexplorerMergeEntitiesJob { break; case "relation": - SparkScholexplorerGenerateSimRel.generateDataFrame(spark, sc, inputPath.replace("/relation",""),targetPath.replace("/relation","") ); + + + SparkSXGeneratePidSimlarity.generateDataFrame(spark, sc, inputPath.replace("/relation",""),targetPath.replace("/relation","") ); RDD rdd = union.mapToPair((PairFunction) f -> { final String source = getJPathString(SOURCEJSONPATH, f); final String target = getJPathString(TARGETJSONPATH, f); @@ -132,9 +153,13 @@ public class SparkScholexplorerMergeEntitiesJob { System.out.println("LOADING PATH :"+targetPath.replace("/relation","")+"/pid_simRel"); Datasetsim_ds =spark.read().load(targetPath.replace("/relation","")+"/pid_simRel").as(Encoders.bean(Relation.class)); - TargetFunction tf = new TargetFunction(); - - Dataset ids = sim_ds.map(tf, Encoders.bean(Relation.class)); + Dataset ids = sim_ds.map((MapFunction) relation-> + { + final String type = StringUtils.substringBefore(relation.getSource(), "|"); + relation.setTarget(String.format("%s|%s", type, StringUtils.substringAfter(relation.getTarget(),"::"))); + return relation; + } + , Encoders.bean(Relation.class)); final Dataset firstJoin = rel_ds diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerGraphImporter.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/SparkScholexplorerGraphImporter.java similarity index 83% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerGraphImporter.java rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/SparkScholexplorerGraphImporter.java index 6cbfab327..96e2c0826 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerGraphImporter.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/SparkScholexplorerGraphImporter.java @@ -1,9 +1,9 @@ -package eu.dnetlib.dhp.graph.scholexplorer; +package eu.dnetlib.dhp.graph.sx; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.graph.scholexplorer.parser.DatasetScholexplorerParser; -import eu.dnetlib.dhp.graph.scholexplorer.parser.PublicationScholexplorerParser; +import eu.dnetlib.dhp.graph.sx.parser.DatasetScholexplorerParser; +import eu.dnetlib.dhp.graph.sx.parser.PublicationScholexplorerParser; import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.scholexplorer.relation.RelationMapper; import org.apache.commons.io.IOUtils; @@ -15,6 +15,12 @@ import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.sql.SparkSession; import scala.Tuple2; + +/** + * This Job read a sequential File containing XML stored in the aggregator + * and generates an RDD of heterogeneous entities like Dataset, Relation, Publication and Unknown + */ + public class SparkScholexplorerGraphImporter { public static void main(String[] args) throws Exception { @@ -22,7 +28,7 @@ public class SparkScholexplorerGraphImporter { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils.toString( SparkScholexplorerGraphImporter.class.getResourceAsStream( - "/eu/dnetlib/dhp/graph/input_graph_scholix_parameters.json"))); + "/eu/dnetlib/dhp/sx/graph/argumentparser/input_graph_scholix_parameters.json"))); parser.parseArgument(args); final SparkSession spark = SparkSession diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/parser/AbstractScholexplorerParser.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/parser/AbstractScholexplorerParser.java similarity index 98% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/parser/AbstractScholexplorerParser.java rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/parser/AbstractScholexplorerParser.java index 6f3aa68d2..f3f81013c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/parser/AbstractScholexplorerParser.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/parser/AbstractScholexplorerParser.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dhp.graph.scholexplorer.parser; +package eu.dnetlib.dhp.graph.sx.parser; import eu.dnetlib.dhp.parser.utility.VtdUtilityParser; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/parser/DatasetScholexplorerParser.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/parser/DatasetScholexplorerParser.java similarity index 99% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/parser/DatasetScholexplorerParser.java rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/parser/DatasetScholexplorerParser.java index 21545092b..7e3f06e22 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/parser/DatasetScholexplorerParser.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/parser/DatasetScholexplorerParser.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dhp.graph.scholexplorer.parser; +package eu.dnetlib.dhp.graph.sx.parser; import com.ximpleware.AutoPilot; import com.ximpleware.VTDGen; @@ -13,7 +13,6 @@ import eu.dnetlib.dhp.parser.utility.VtdUtilityParser.Node; import eu.dnetlib.scholexplorer.relation.RelInfo; import eu.dnetlib.scholexplorer.relation.RelationMapper; import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.yarn.webapp.hamlet.Hamlet; import java.util.ArrayList; import java.util.Arrays; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/parser/PublicationScholexplorerParser.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/parser/PublicationScholexplorerParser.java similarity index 99% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/parser/PublicationScholexplorerParser.java rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/parser/PublicationScholexplorerParser.java index d5cf94a77..456b19064 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/parser/PublicationScholexplorerParser.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/sx/parser/PublicationScholexplorerParser.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dhp.graph.scholexplorer.parser; +package eu.dnetlib.dhp.graph.sx.parser; import com.ximpleware.AutoPilot; import com.ximpleware.VTDGen; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/convertXmlToEntities/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/convertXmlToEntities/oozie_app/workflow.xml deleted file mode 100644 index c7f628b6d..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/convertXmlToEntities/oozie_app/workflow.xml +++ /dev/null @@ -1,90 +0,0 @@ - - - - - sourcePath - the source path - - - hive_db_name - the target hive database name - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - number of cores used by single executor - - - - - ${jobTracker} - ${nameNode} - - - mapreduce.job.queuename - ${queueName} - - - oozie.launcher.mapred.job.queue.name - ${oozieLauncherQueueName} - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - yarn - cluster - MapGraphAsHiveDB - eu.dnetlib.dhp.oa.graph.SparkGraphImporterJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory ${sparkExecutorMemory} - --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" - --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" - --conf spark.sql.warehouse.dir="/user/hive/warehouse" - - -mt yarn - -s${sourcePath} - -db${hive_db_name} - -h${hive_metastore_uris} - - - - - - - - ${jobTracker} - ${nameNode} - - - hive.metastore.uris - ${hive_metastore_uris} - - - ${hive_jdbc_url}/${hive_db_name} - - hive_db_name=${hive_db_name} - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/mergeEntities/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/mergeEntities/oozie_app/config-default.xml deleted file mode 100644 index 6fb2a1253..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/mergeEntities/oozie_app/config-default.xml +++ /dev/null @@ -1,10 +0,0 @@ - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/generate_sim_rel_scholix_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/argumentparser/generate_sim_rel_scholix_parameters.json similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/generate_sim_rel_scholix_parameters.json rename to dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/argumentparser/generate_sim_rel_scholix_parameters.json diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/import_from_mongo_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/argumentparser/import_from_mongo_parameters.json similarity index 73% rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/import_from_mongo_parameters.json rename to dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/argumentparser/import_from_mongo_parameters.json index 9032be287..ab8e760b2 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/import_from_mongo_parameters.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/argumentparser/import_from_mongo_parameters.json @@ -1,12 +1,11 @@ [ {"paramName":"n", "paramLongName":"namenode", "paramDescription": "the name node", "paramRequired": true}, - {"paramName":"u", "paramLongName":"user", "paramDescription": "the name node", "paramRequired": true}, - {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the name node", "paramRequired": true}, + {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the paht where store HDFS File", "paramRequired": true}, {"paramName":"h", "paramLongName":"dbhost", "paramDescription": "the mongo host", "paramRequired": true}, {"paramName":"p", "paramLongName":"dbport", "paramDescription": "the mongo port", "paramRequired": true}, {"paramName":"f", "paramLongName":"format", "paramDescription": "the metadata format to import", "paramRequired": true}, {"paramName":"l", "paramLongName":"layout", "paramDescription": "the metadata layout to import", "paramRequired": true}, {"paramName":"i", "paramLongName":"interpretation", "paramDescription": "the metadata interpretation to import", "paramRequired": true}, - {"paramName":"dn", "paramLongName":"dbName", "paramDescription": "the database Name", "paramRequired": true} + {"paramName":"dn", "paramLongName":"dbName", "paramDescription": "the mongo database Name", "paramRequired": true} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/input_extract_entities_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/input_extract_entities_parameters.json rename to dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/input_graph_scholix_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/argumentparser/input_graph_scholix_parameters.json similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/input_graph_scholix_parameters.json rename to dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/argumentparser/input_graph_scholix_parameters.json diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/merge_entities_scholix_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/argumentparser/merge_entities_scholix_parameters.json similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/merge_entities_scholix_parameters.json rename to dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/argumentparser/merge_entities_scholix_parameters.json diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/relations.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/relations.json similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/relations.json rename to dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/relations.json diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/convertXmlToEntities/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step1/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/convertXmlToEntities/oozie_app/config-default.xml rename to dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step1/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/importMongoDbToHdfs/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step1/oozie_app/workflow.xml similarity index 91% rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/importMongoDbToHdfs/oozie_app/workflow.xml rename to dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step1/oozie_app/workflow.xml index 35aa173c6..918cc652a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/importMongoDbToHdfs/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step1/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + workingPath @@ -55,7 +55,7 @@ ${jobTracker} ${nameNode} - eu.dnetlib.dhp.graph.scholexplorer.ImportDataFromMongo + eu.dnetlib.dhp.graph.sx.ImportDataFromMongo -t${targetPath} -n${nameNode} -u${user} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/extractEntities/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step2/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/extractEntities/oozie_app/config-default.xml rename to dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step2/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/extractEntities/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step2/oozie_app/workflow.xml similarity index 93% rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/extractEntities/oozie_app/workflow.xml rename to dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step2/oozie_app/workflow.xml index 6caa8b1c3..01fdec2ef 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/extractEntities/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step2/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + sourcePath @@ -54,7 +54,7 @@ yarn-cluster cluster Extract ${entities} - eu.dnetlib.dhp.graph.scholexplorer.SparkExtractEntitiesJob + eu.dnetlib.dhp.graph.sx.SparkExtractEntitiesJob dhp-graph-mapper-${projectVersion}.jar --executor-memory ${sparkExecutorMemory} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/importMongoDbToHdfs/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step3/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/importMongoDbToHdfs/oozie_app/config-default.xml rename to dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step3/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/mergeEntities/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step3/oozie_app/workflow.xml similarity index 90% rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/mergeEntities/oozie_app/workflow.xml rename to dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step3/oozie_app/workflow.xml index 44c6004e2..cf66ab6e6 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/mergeEntities/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/step3/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + sourcePath @@ -45,7 +45,7 @@ yarn-cluster cluster Merge ${entity} - eu.dnetlib.dhp.graph.scholexplorer.SparkScholexplorerMergeEntitiesJob + eu.dnetlib.dhp.graph.sx.SparkScholexplorerCreateRawGraphJob dhp-graph-mapper-${projectVersion}.jar --executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} -mt yarn-cluster diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/ScholexplorerParserTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/sx/ScholexplorerParserTest.java similarity index 89% rename from dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/ScholexplorerParserTest.java rename to dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/sx/ScholexplorerParserTest.java index 2185b7987..0717efe4a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/ScholexplorerParserTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/sx/ScholexplorerParserTest.java @@ -1,9 +1,9 @@ -package eu.dnetlib.dhp.graph.scholexplorer; +package eu.dnetlib.dhp.graph.sx; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializationFeature; -import eu.dnetlib.dhp.graph.scholexplorer.parser.DatasetScholexplorerParser; +import eu.dnetlib.dhp.graph.sx.parser.DatasetScholexplorerParser; import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.scholexplorer.relation.RelationMapper; import org.apache.commons.io.IOUtils; diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerGraphImporterTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/sx/SparkScholexplorerGraphImporterTest.java similarity index 58% rename from dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerGraphImporterTest.java rename to dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/sx/SparkScholexplorerGraphImporterTest.java index 505e7581a..f33340547 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerGraphImporterTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/sx/SparkScholexplorerGraphImporterTest.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dhp.graph.scholexplorer; +package eu.dnetlib.dhp.graph.sx; diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerMergeEntitiesJobTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/sx/SparkScholexplorerMergeEntitiesJobTest.java similarity index 58% rename from dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerMergeEntitiesJobTest.java rename to dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/sx/SparkScholexplorerMergeEntitiesJobTest.java index 7a93c5834..623c38112 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerMergeEntitiesJobTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/sx/SparkScholexplorerMergeEntitiesJobTest.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dhp.graph.scholexplorer; +package eu.dnetlib.dhp.graph.sx; diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/scholexplorer/dmf.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sx/dmf.xml similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/scholexplorer/dmf.xml rename to dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sx/dmf.xml diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/scholexplorer/t.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sx/t.xml similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/scholexplorer/t.xml rename to dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sx/t.xml