package eu.dnetlib.graph; import com.clearspring.analytics.util.Lists; import com.google.common.collect.Sets; import eu.dnetlib.pace.utils.Utility; import eu.dnetlib.support.ConnectedComponent; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.graphx.*; import org.apache.spark.rdd.RDD; import org.apache.spark.storage.StorageLevel; import scala.Tuple2; import scala.reflect.ClassTag; import scala.reflect.ClassTag$; import java.util.List; public class JavaGraphProcessor { // public static JavaPairRDD> findCCs(JavaPairRDD vertexes, JavaRDD> edges, int maxIterations) { ClassTag stringTag = ClassTag$.MODULE$.apply(String.class); Graph graph = Graph.apply( vertexes.rdd(), edges.rdd(), "", StorageLevel.MEMORY_ONLY(), StorageLevel.MEMORY_ONLY(), stringTag, stringTag ); GraphOps graphOps = new GraphOps<>(graph, stringTag, stringTag); JavaRDD> cc = graphOps.connectedComponents(maxIterations).vertices().toJavaRDD(); JavaPairRDD joinResult = vertexes .leftOuterJoin(cc.mapToPair(x -> x)) .mapToPair(x -> { if (!x._2()._2().isPresent()) { return new Tuple2<>(x._1(), x._2()._1()); } else { return new Tuple2<>(x._2()._2(), x._2()._1()); } }); return joinResult .groupByKey() .map(x -> Lists.newArrayList(x._2())) .zipWithUniqueId() .mapToPair(x -> new Tuple2<>("dedup______::" + x._2().toString(), x._1())); } }