implementation of the java version of the graph processor

parent 6c47fb0e67
commit fb2eed9f0e
@@ -1,2 +1,2 @@
-# Thu Mar 31 12:53:27 CEST 2022
+# Tue Apr 19 15:27:59 CEST 2022
 projectPropertyKey=projectPropertyValue
@@ -1,7 +1,7 @@
 package eu.dnetlib;
 
 import com.google.common.hash.Hashing;
-import eu.dnetlib.graph.GraphProcessor;
+import eu.dnetlib.graph.JavaGraphProcessor;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.model.MapDocument;
 import eu.dnetlib.pace.util.BlockProcessorForTesting;
@@ -19,7 +19,6 @@ import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.graphx.Edge;
-import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
@@ -138,16 +137,15 @@ public class Deduper implements Serializable {
             .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
             .mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));
 
-        final RDD<Edge<String>> edgeRdd = spark
+        final JavaRDD<Edge<String>> edgeRdd = spark
             .read()
             .load(simRelsPath)
             .as(Encoders.bean(Relation.class))
             .javaRDD()
-            .map(Relation::toEdgeRdd)
-            .rdd();
+            .map(Relation::toEdgeRdd);
 
-        JavaRDD<ConnectedComponent> ccs = GraphProcessor
-            .findCCs(vertexes.rdd(), edgeRdd, maxIterations)
+        JavaRDD<ConnectedComponent> ccs = JavaGraphProcessor
+            .findCCs(vertexes, edgeRdd, maxIterations)
             .toJavaRDD();
 
         JavaRDD<Relation> mergeRel = ccs
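Relation::toEdgeRdd and hash(s) are used in this hunk but not defined anywhere in the commit. A minimal sketch of what they could look like; the murmur3 hashing is suggested by the com.google.common.hash.Hashing import kept in Deduper, but both the field names and the hash choice are assumptions, not taken from this diff:

    import com.google.common.hash.Hashing;
    import org.apache.spark.graphx.Edge;

    import java.io.Serializable;
    import java.nio.charset.StandardCharsets;

    public class Relation implements Serializable {
        private String source;    // hypothetical field names
        private String target;
        private String relClass;

        // builds a GraphX edge whose endpoints are the hashed source/target ids,
        // so they line up with the hash(s) keys used for the vertexes
        public Edge<String> toEdgeRdd() {
            return new Edge<>(hash(source), hash(target), relClass);
        }

        // assumption: same hashing as the hash(s) call in the job
        public static long hash(String s) {
            return Hashing.murmur3_128().hashString(s, StandardCharsets.UTF_8).asLong();
        }
    }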
@@ -0,0 +1,47 @@
+package eu.dnetlib.graph;
+
+import com.google.common.collect.Sets;
+import eu.dnetlib.support.ConnectedComponent;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.graphx.*;
+import org.apache.spark.rdd.RDD;
+import org.apache.spark.storage.StorageLevel;
+import scala.Tuple2;
+import scala.reflect.ClassTag;
+import scala.reflect.ClassTag$;
+
+public class JavaGraphProcessor {
+
+    public static RDD<ConnectedComponent> findCCs(JavaPairRDD<Object, String> vertexes, JavaRDD<Edge<String>> edges, int maxIterations) {
+
+        ClassTag<String> stringTag = ClassTag$.MODULE$.apply(String.class);
+        Graph<String, String> graph =
+                Graph.apply(
+                        vertexes.rdd(),
+                        edges.rdd(),
+                        "",
+                        StorageLevel.MEMORY_ONLY(),
+                        StorageLevel.MEMORY_ONLY(),
+                        stringTag,
+                        stringTag
+                );
+
+        GraphOps<String, String> graphOps = new GraphOps<>(graph, stringTag, stringTag);
+        JavaRDD<Tuple2<Object, Object>> cc = graphOps.connectedComponents(maxIterations).vertices().toJavaRDD();
+
+        JavaPairRDD<Object, String> joinResult = vertexes
+                .leftOuterJoin(cc.mapToPair(x -> x))
+                .mapToPair(x -> {
+                    if (!x._2()._2().isPresent()) {
+                        return new Tuple2<>(x._1(), x._2()._1());
+                    } else {
+                        return new Tuple2<>(x._2()._2().get(), x._2()._1());
+                    }
+                });
+
+        return joinResult.groupByKey().map(x -> new ConnectedComponent(Sets.newHashSet(x._2()))).rdd();
+
+    }
+
+}
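For context, a minimal, hypothetical usage sketch of the new findCCs entry point on a toy graph (local master, invented ids and labels; ConnectedComponent comes from eu.dnetlib.support as in the diff):

    import java.util.Arrays;

    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.graphx.Edge;
    import scala.Tuple2;

    import eu.dnetlib.graph.JavaGraphProcessor;
    import eu.dnetlib.support.ConnectedComponent;

    public class FindCCsExample {
        public static void main(String[] args) {
            JavaSparkContext sc = new JavaSparkContext(
                    new SparkConf().setAppName("findCCs-example").setMaster("local[*]"));

            // three vertices and one edge: expect components {a, b} and {c}
            JavaPairRDD<Object, String> vertexes = sc.parallelizePairs(Arrays.asList(
                    new Tuple2<>((Object) 1L, "a"),
                    new Tuple2<>((Object) 2L, "b"),
                    new Tuple2<>((Object) 3L, "c")));
            JavaRDD<Edge<String>> edges = sc.parallelize(Arrays.asList(
                    new Edge<>(1L, 2L, "simRel")));

            JavaRDD<ConnectedComponent> ccs = JavaGraphProcessor
                    .findCCs(vertexes, edges, 20)
                    .toJavaRDD();
            ccs.collect().forEach(System.out::println);

            sc.stop();
        }
    }

Note the role of the leftOuterJoin inside findCCs: a vertex that GraphX did not assign to a component keeps its own id as the grouping key, so every vertex lands in exactly one ConnectedComponent.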
@@ -1,7 +1,7 @@
 package eu.dnetlib.jobs;
 
 import eu.dnetlib.Deduper;
-import eu.dnetlib.graph.GraphProcessor;
+import eu.dnetlib.graph.JavaGraphProcessor;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.util.MapDocumentUtil;
 import eu.dnetlib.pace.utils.Utility;
@@ -78,16 +78,15 @@ public class SparkCreateMergeRels extends AbstractSparkJob {
             .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
             .mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));
 
-        final RDD<Edge<String>> edgeRdd = spark
+        final JavaRDD<Edge<String>> edgeRdd = spark
             .read()
             .load(workingPath + "/simrels")
             .as(Encoders.bean(Relation.class))
             .javaRDD()
-            .map(Relation::toEdgeRdd)
-            .rdd();
+            .map(Relation::toEdgeRdd);
 
-        JavaRDD<ConnectedComponent> ccs = GraphProcessor
-            .findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations())
+        JavaRDD<ConnectedComponent> ccs = JavaGraphProcessor
+            .findCCs(vertexes, edgeRdd, dedupConf.getWf().getMaxIterations())
             .toJavaRDD();
 
         JavaRDD<Relation> mergeRel = ccs
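Both call sites end with the same truncated chain (JavaRDD<Relation> mergeRel = ccs ...), and the commit does not show how merge relations are derived from a component. A hypothetical fragment of that step, for orientation only; the accessors, the Relation constructor, and the "mergeRel" label are all assumptions:

    // for each non-trivial component, link a representative id to every member
    JavaRDD<Relation> mergeRel = ccs
            .filter(cc -> cc.getDocs().size() > 1)              // hypothetical accessor
            .flatMap(cc -> {
                String root = java.util.Collections.min(cc.getDocs()); // e.g. smallest id as representative
                return cc.getDocs().stream()
                        .map(doc -> new Relation(root, doc, "mergeRel")) // hypothetical constructor
                        .iterator();
            });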
@@ -1,6 +1,7 @@
 package eu.dnetlib.pace;
 
 import eu.dnetlib.Deduper;
+import eu.dnetlib.graph.JavaGraphProcessor;
 import eu.dnetlib.jobs.SparkCreateDedupEntity;
 import eu.dnetlib.jobs.SparkCreateMergeRels;
 import eu.dnetlib.jobs.SparkCreateSimRels;
@@ -11,6 +12,7 @@ import eu.dnetlib.pace.util.MapDocumentUtil;
 import eu.dnetlib.pace.utils.Utility;
 import eu.dnetlib.support.ArgumentApplicationParser;
 import eu.dnetlib.support.Block;
+import eu.dnetlib.support.ConnectedComponent;
 import eu.dnetlib.support.Relation;
 import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.conf.Configuration;
@@ -21,6 +23,8 @@ import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.ForeachFunction;
 import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.graphx.Edge;
+import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
@@ -179,10 +183,10 @@ public class DedupLocalTest extends DedupTestUtils {
 
         //custom parameters for this test
         DedupConfig dedupConfig = DedupConfig.load(readFileFromHDFS(
-                Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pub.instancetype.tree.conf.json").toURI()).toFile().getAbsolutePath()
+                Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pubs.fdup.exp.json").toURI()).toFile().getAbsolutePath()
         ));
 
-        String inputPath = Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.dump.1000.json").toURI()).toFile().getAbsolutePath();
+        String inputPath = Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.to.fix.json").toURI()).toFile().getAbsolutePath();
 
         String simRelsPath = workingPath + "/simrels";
         String mergeRelsPath = workingPath + "/mergerels";