Implementation of the Java version of the graph processor

miconis 2022-04-19 15:29:29 +02:00
parent 6c47fb0e67
commit fb2eed9f0e
5 changed files with 64 additions and 17 deletions

View File

@@ -1,2 +1,2 @@
-# Thu Mar 31 12:53:27 CEST 2022
+# Tue Apr 19 15:27:59 CEST 2022
 projectPropertyKey=projectPropertyValue

View File

@@ -1,7 +1,7 @@
 package eu.dnetlib;
 import com.google.common.hash.Hashing;
-import eu.dnetlib.graph.GraphProcessor;
+import eu.dnetlib.graph.JavaGraphProcessor;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.model.MapDocument;
 import eu.dnetlib.pace.util.BlockProcessorForTesting;
@@ -19,7 +19,6 @@ import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.graphx.Edge;
-import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
@@ -138,16 +137,15 @@ public class Deduper implements Serializable {
     .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
     .mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));
-final RDD<Edge<String>> edgeRdd = spark
+final JavaRDD<Edge<String>> edgeRdd = spark
     .read()
     .load(simRelsPath)
     .as(Encoders.bean(Relation.class))
     .javaRDD()
-    .map(Relation::toEdgeRdd)
-    .rdd();
+    .map(Relation::toEdgeRdd);
-JavaRDD<ConnectedComponent> ccs = GraphProcessor
-    .findCCs(vertexes.rdd(), edgeRdd, maxIterations)
+JavaRDD<ConnectedComponent> ccs = JavaGraphProcessor
+    .findCCs(vertexes, edgeRdd, maxIterations)
     .toJavaRDD();
 JavaRDD<Relation> mergeRel = ccs
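This hunk is where the Java entry point pays off: the previous Scala-oriented GraphProcessor worked on plain Scala RDDs, so the Java caller had to convert with vertexes.rdd() and a trailing .rdd() on the edge pipeline, while the new JavaGraphProcessor takes the Java wrappers directly. A rough before/after of the findCCs signature; the old one is only inferred from the removed call sites in this diff:

    // before (Scala-oriented GraphProcessor, inferred from the removed call sites)
    RDD<ConnectedComponent> findCCs(RDD<Tuple2<Object, String>> vertexes, RDD<Edge<String>> edges, int maxIterations);

    // after (JavaGraphProcessor introduced by this commit, see the new file below)
    RDD<ConnectedComponent> findCCs(JavaPairRDD<Object, String> vertexes, JavaRDD<Edge<String>> edges, int maxIterations);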

View File

@@ -0,0 +1,47 @@
package eu.dnetlib.graph;

import com.google.common.collect.Sets;
import eu.dnetlib.support.ConnectedComponent;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.graphx.*;
import org.apache.spark.rdd.RDD;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;
import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;

public class JavaGraphProcessor {

    public static RDD<ConnectedComponent> findCCs(JavaPairRDD<Object, String> vertexes, JavaRDD<Edge<String>> edges, int maxIterations) {

        ClassTag<String> stringTag = ClassTag$.MODULE$.apply(String.class);
        Graph<String, String> graph =
                Graph.apply(
                        vertexes.rdd(),
                        edges.rdd(),
                        "",
                        StorageLevel.MEMORY_ONLY(),
                        StorageLevel.MEMORY_ONLY(),
                        stringTag,
                        stringTag
                );

        GraphOps<String, String> graphOps = new GraphOps<>(graph, stringTag, stringTag);
        JavaRDD<Tuple2<Object, Object>> cc = graphOps.connectedComponents(maxIterations).vertices().toJavaRDD();

        JavaPairRDD<Object, String> joinResult = vertexes
                .leftOuterJoin(cc.mapToPair(x -> x))
                .mapToPair(x -> {
                    if (!x._2()._2().isPresent()) {
                        return new Tuple2<>(x._1(), x._2()._1());
                    } else {
                        return new Tuple2<>(x._2()._2(), x._2()._1());
                    }
                });

        return joinResult.groupByKey().map(x -> new ConnectedComponent(Sets.newHashSet(x._2()))).rdd();
    }
}
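The leftOuterJoin re-attaches each vertex value to the component id computed by GraphX; if a vertex came back without a component it is keyed by its own id, so it ends up as a singleton component. For context, a minimal sketch of how the new entry point is wired from the jobs, mirroring the call sites changed in Deduper and SparkCreateMergeRels above; entityIds, simRelsPath and maxIterations are placeholders, and hash(...) stands for the existing Deduper hashing helper, so this is illustrative only, not code from the commit:

    // illustrative wiring of JavaGraphProcessor.findCCs (placeholder inputs)
    JavaPairRDD<Object, String> vertexes = entityIds                       // JavaRDD<String> of entity ids
            .mapToPair((PairFunction<String, Object, String>) id -> new Tuple2<>(hash(id), id));

    JavaRDD<Edge<String>> edges = spark
            .read()
            .load(simRelsPath)
            .as(Encoders.bean(Relation.class))
            .javaRDD()
            .map(Relation::toEdgeRdd);                                      // Relation -> GraphX Edge, as in this diff

    JavaRDD<ConnectedComponent> ccs = JavaGraphProcessor
            .findCCs(vertexes, edges, maxIterations)
            .toJavaRDD();                                                   // findCCs still returns a Scala RDD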

View File

@@ -1,7 +1,7 @@
 package eu.dnetlib.jobs;
 import eu.dnetlib.Deduper;
-import eu.dnetlib.graph.GraphProcessor;
+import eu.dnetlib.graph.JavaGraphProcessor;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.util.MapDocumentUtil;
 import eu.dnetlib.pace.utils.Utility;
@@ -78,16 +78,15 @@ public class SparkCreateMergeRels extends AbstractSparkJob {
     .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
     .mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));
-final RDD<Edge<String>> edgeRdd = spark
+final JavaRDD<Edge<String>> edgeRdd = spark
     .read()
     .load(workingPath + "/simrels")
     .as(Encoders.bean(Relation.class))
     .javaRDD()
-    .map(Relation::toEdgeRdd)
-    .rdd();
+    .map(Relation::toEdgeRdd);
-JavaRDD<ConnectedComponent> ccs = GraphProcessor
-    .findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations())
+JavaRDD<ConnectedComponent> ccs = JavaGraphProcessor
+    .findCCs(vertexes, edgeRdd, dedupConf.getWf().getMaxIterations())
     .toJavaRDD();
 JavaRDD<Relation> mergeRel = ccs

View File

@@ -1,6 +1,7 @@
 package eu.dnetlib.pace;
 import eu.dnetlib.Deduper;
+import eu.dnetlib.graph.JavaGraphProcessor;
 import eu.dnetlib.jobs.SparkCreateDedupEntity;
 import eu.dnetlib.jobs.SparkCreateMergeRels;
 import eu.dnetlib.jobs.SparkCreateSimRels;
@@ -11,6 +12,7 @@ import eu.dnetlib.pace.util.MapDocumentUtil;
 import eu.dnetlib.pace.utils.Utility;
 import eu.dnetlib.support.ArgumentApplicationParser;
 import eu.dnetlib.support.Block;
+import eu.dnetlib.support.ConnectedComponent;
 import eu.dnetlib.support.Relation;
 import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.conf.Configuration;
@@ -21,6 +23,8 @@ import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.ForeachFunction;
 import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.graphx.Edge;
+import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
@@ -179,10 +183,10 @@ public class DedupLocalTest extends DedupTestUtils {
     //custom parameters for this test
     DedupConfig dedupConfig = DedupConfig.load(readFileFromHDFS(
-        Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pub.instancetype.tree.conf.json").toURI()).toFile().getAbsolutePath()
+        Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pubs.fdup.exp.json").toURI()).toFile().getAbsolutePath()
     ));
-    String inputPath = Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.dump.1000.json").toURI()).toFile().getAbsolutePath();
+    String inputPath = Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.to.fix.json").toURI()).toFile().getAbsolutePath();
     String simRelsPath = workingPath + "/simrels";
     String mergeRelsPath = workingPath + "/mergerels";
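The new imports in DedupLocalTest (JavaGraphProcessor, ConnectedComponent, Edge, RDD) suggest the test class now exercises the connected-components step directly, but the corresponding test body is outside the hunks shown here. A minimal, illustrative local exercise of the new entry point, assuming a local Spark context and JUnit-style assertions; none of the names below are taken from the commit:

    // illustrative only: two vertexes joined by one similarity edge collapse into one component
    JavaSparkContext context = new JavaSparkContext("local[*]", "JavaGraphProcessorSketch");

    JavaPairRDD<Object, String> vertexes = context.parallelizePairs(Arrays.asList(
            new Tuple2<>((Object) 1L, "id1"),
            new Tuple2<>((Object) 2L, "id2")));

    JavaRDD<Edge<String>> edges = context.parallelize(
            Collections.singletonList(new Edge<>(1L, 2L, "simRel")));

    JavaRDD<ConnectedComponent> ccs = JavaGraphProcessor
            .findCCs(vertexes, edges, 20)
            .toJavaRDD();

    assertEquals(1, ccs.count()); // both ids end up in a single connected component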