Implementation of the Java version of the graph processor
commit fb2eed9f0e
parent 6c47fb0e67
@@ -1,2 +1 @@
-# Thu Mar 31 12:53:27 CEST 2022
-projectPropertyKey=projectPropertyValue
+# Tue Apr 19 15:27:59 CEST 2022
@@ -1,7 +1,7 @@
 package eu.dnetlib;
 
 import com.google.common.hash.Hashing;
-import eu.dnetlib.graph.GraphProcessor;
+import eu.dnetlib.graph.JavaGraphProcessor;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.model.MapDocument;
 import eu.dnetlib.pace.util.BlockProcessorForTesting;
@@ -19,7 +19,6 @@ import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.graphx.Edge;
-import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
@@ -138,16 +137,15 @@ public class Deduper implements Serializable {
                 .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
                 .mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));
 
-        final RDD<Edge<String>> edgeRdd = spark
+        final JavaRDD<Edge<String>> edgeRdd = spark
                 .read()
                 .load(simRelsPath)
                 .as(Encoders.bean(Relation.class))
                 .javaRDD()
-                .map(Relation::toEdgeRdd)
-                .rdd();
+                .map(Relation::toEdgeRdd);
 
-        JavaRDD<ConnectedComponent> ccs = GraphProcessor
-                .findCCs(vertexes.rdd(), edgeRdd, maxIterations)
+        JavaRDD<ConnectedComponent> ccs = JavaGraphProcessor
+                .findCCs(vertexes, edgeRdd, maxIterations)
                 .toJavaRDD();
 
         JavaRDD<Relation> mergeRel = ccs
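For this pipeline to hang together, the Edge endpoints produced by Relation::toEdgeRdd must be hashed with the same function that keys the vertexes pairs, since JavaGraphProcessor joins the two by vertex id. Neither Relation nor Deduper.hash is part of this diff, so the following is only a sketch of the assumed shape, with a hypothetical class name, hypothetical fields, and a guessed hash function:

import com.google.common.hash.Hashing;
import org.apache.spark.graphx.Edge;

import java.nio.charset.StandardCharsets;

// Hypothetical stand-in for eu.dnetlib.support.Relation (the real class is not in this diff).
class RelationSketch {
    String source;  // original source entity id (assumed field)
    String target;  // original target entity id (assumed field)

    // Assumed to mirror Deduper.hash(s); murmur3_128 is a guess at the actual implementation.
    static long hash(String id) {
        return Hashing.murmur3_128().hashString(id, StandardCharsets.UTF_8).asLong();
    }

    // GraphX identifies vertices by long ids, so both endpoints must be hashed
    // exactly like the vertex keys built with mapToPair(s -> new Tuple2<>(hash(s), s)).
    Edge<String> toEdgeRdd() {
        return new Edge<>(hash(source), hash(target), "simRel");
    }
}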
@@ -0,0 +1,47 @@
+package eu.dnetlib.graph;
+
+import com.google.common.collect.Sets;
+import eu.dnetlib.support.ConnectedComponent;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.graphx.*;
+import org.apache.spark.rdd.RDD;
+import org.apache.spark.storage.StorageLevel;
+import scala.Tuple2;
+import scala.reflect.ClassTag;
+import scala.reflect.ClassTag$;
+
+public class JavaGraphProcessor {
+
+    public static RDD<ConnectedComponent> findCCs(JavaPairRDD<Object, String> vertexes, JavaRDD<Edge<String>> edges, int maxIterations) {
+
+        // GraphX is a Scala API, so materialize the ClassTag it requires for the attribute types
+        ClassTag<String> stringTag = ClassTag$.MODULE$.apply(String.class);
+        Graph<String, String> graph =
+                Graph.apply(
+                        vertexes.rdd(),
+                        edges.rdd(),
+                        "",
+                        StorageLevel.MEMORY_ONLY(),
+                        StorageLevel.MEMORY_ONLY(),
+                        stringTag,
+                        stringTag
+                );
+
+        GraphOps<String, String> graphOps = new GraphOps<>(graph, stringTag, stringTag);
+        // label every vertex with the smallest vertex id of its connected component
+        JavaRDD<Tuple2<Object, Object>> cc = graphOps.connectedComponents(maxIterations).vertices().toJavaRDD();
+
+        // join the component labels back onto the original string ids
+        JavaPairRDD<Object, String> joinResult = vertexes
+                .leftOuterJoin(cc.mapToPair(x -> x))
+                .mapToPair(x -> {
+                    if (!x._2()._2().isPresent()) {
+                        // vertex without a component label: it forms its own component
+                        return new Tuple2<>(x._1(), x._2()._1());
+                    } else {
+                        // key by the component id, unwrapping the Optional from the left outer join
+                        return new Tuple2<>(x._2()._2().get(), x._2()._1());
+                    }
+                });
+
+        // group the original ids by component and wrap each group into a ConnectedComponent
+        return joinResult.groupByKey().map(x -> new ConnectedComponent(Sets.newHashSet(x._2()))).rdd();
+    }
+}
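The new JavaGraphProcessor wraps GraphX's connected components for Java callers: it assembles the graph, labels each vertex with its component id, joins the labels back to the original string ids, and groups them into ConnectedComponent sets. A minimal usage sketch follows (not part of the commit; the local master, sample ids, and printed output are illustrative assumptions):

import eu.dnetlib.graph.JavaGraphProcessor;
import eu.dnetlib.support.ConnectedComponent;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.graphx.Edge;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;

import java.util.Arrays;

public class JavaGraphProcessorSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("findCCs-sketch")
                .master("local[*]")   // local master for illustration only
                .getOrCreate();
        JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

        // vertices as (hashedId, originalId) pairs, mirroring Deduper's Tuple2<>(hash(s), s)
        JavaPairRDD<Object, String> vertexes = sc.parallelizePairs(Arrays.asList(
                new Tuple2<Object, String>(1L, "id1"),
                new Tuple2<Object, String>(2L, "id2"),
                new Tuple2<Object, String>(3L, "id3")));

        // one similarity edge linking id1 and id2; the String attribute is ignored by findCCs
        JavaRDD<Edge<String>> edges = sc.parallelize(Arrays.asList(
                new Edge<>(1L, 2L, "simRel")));

        JavaRDD<ConnectedComponent> ccs = JavaGraphProcessor
                .findCCs(vertexes, edges, 10)
                .toJavaRDD();

        // expect two components: {id1, id2} and {id3}
        ccs.collect().forEach(System.out::println);

        spark.stop();
    }
}

Returning a Scala RDD<ConnectedComponent> rather than a JavaRDD keeps the signature drop-in compatible with the old GraphProcessor, which is why the call sites above and below still append .toJavaRDD().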
@@ -1,7 +1,7 @@
 package eu.dnetlib.jobs;
 
 import eu.dnetlib.Deduper;
-import eu.dnetlib.graph.GraphProcessor;
+import eu.dnetlib.graph.JavaGraphProcessor;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.util.MapDocumentUtil;
 import eu.dnetlib.pace.utils.Utility;
@@ -78,16 +78,15 @@ public class SparkCreateMergeRels extends AbstractSparkJob {
                 .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
                 .mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));
 
-        final RDD<Edge<String>> edgeRdd = spark
+        final JavaRDD<Edge<String>> edgeRdd = spark
                 .read()
                 .load(workingPath + "/simrels")
                 .as(Encoders.bean(Relation.class))
                 .javaRDD()
-                .map(Relation::toEdgeRdd)
-                .rdd();
+                .map(Relation::toEdgeRdd);
 
-        JavaRDD<ConnectedComponent> ccs = GraphProcessor
-                .findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations())
+        JavaRDD<ConnectedComponent> ccs = JavaGraphProcessor
+                .findCCs(vertexes, edgeRdd, dedupConf.getWf().getMaxIterations())
                 .toJavaRDD();
 
         JavaRDD<Relation> mergeRel = ccs
@@ -1,6 +1,7 @@
 package eu.dnetlib.pace;
 
 import eu.dnetlib.Deduper;
+import eu.dnetlib.graph.JavaGraphProcessor;
 import eu.dnetlib.jobs.SparkCreateDedupEntity;
 import eu.dnetlib.jobs.SparkCreateMergeRels;
 import eu.dnetlib.jobs.SparkCreateSimRels;
@@ -11,6 +12,7 @@ import eu.dnetlib.pace.util.MapDocumentUtil;
 import eu.dnetlib.pace.utils.Utility;
 import eu.dnetlib.support.ArgumentApplicationParser;
 import eu.dnetlib.support.Block;
+import eu.dnetlib.support.ConnectedComponent;
 import eu.dnetlib.support.Relation;
 import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.conf.Configuration;
@@ -21,6 +23,8 @@ import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.ForeachFunction;
 import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.graphx.Edge;
+import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
@@ -179,10 +183,10 @@ public class DedupLocalTest extends DedupTestUtils {
 
         //custom parameters for this test
         DedupConfig dedupConfig = DedupConfig.load(readFileFromHDFS(
-                Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pub.instancetype.tree.conf.json").toURI()).toFile().getAbsolutePath()
+                Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pubs.fdup.exp.json").toURI()).toFile().getAbsolutePath()
         ));
 
-        String inputPath = Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.dump.1000.json").toURI()).toFile().getAbsolutePath();
+        String inputPath = Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.to.fix.json").toURI()).toFile().getAbsolutePath();
 
         String simRelsPath = workingPath + "/simrels";
         String mergeRelsPath = workingPath + "/mergerels";