2022-04-19 15:29:29 +02:00
|
|
|
package eu.dnetlib.graph;
|
|
|
|
|
2023-04-17 11:06:27 +02:00
|
|
|
import com.clearspring.analytics.util.Lists;
|
2022-04-19 15:29:29 +02:00
|
|
|
import com.google.common.collect.Sets;
|
2023-04-17 11:06:27 +02:00
|
|
|
import eu.dnetlib.pace.utils.Utility;
|
2022-04-19 15:29:29 +02:00
|
|
|
import eu.dnetlib.support.ConnectedComponent;
|
|
|
|
import org.apache.spark.api.java.JavaPairRDD;
|
|
|
|
import org.apache.spark.api.java.JavaRDD;
|
|
|
|
import org.apache.spark.graphx.*;
|
|
|
|
import org.apache.spark.rdd.RDD;
|
|
|
|
import org.apache.spark.storage.StorageLevel;
|
|
|
|
import scala.Tuple2;
|
|
|
|
import scala.reflect.ClassTag;
|
|
|
|
import scala.reflect.ClassTag$;
|
|
|
|
|
2023-04-17 11:06:27 +02:00
|
|
|
import java.util.List;
|
|
|
|
|
2022-04-19 15:29:29 +02:00
|
|
|
public class JavaGraphProcessor {
|
|
|
|
|
2023-04-17 11:06:27 +02:00
|
|
|
//<ccId, list(json)>
|
|
|
|
public static JavaPairRDD<String, List<String>> findCCs(JavaPairRDD<Object, String> vertexes, JavaRDD<Edge<String>> edges, int maxIterations) {
|
2022-04-19 15:29:29 +02:00
|
|
|
|
|
|
|
ClassTag<String> stringTag = ClassTag$.MODULE$.apply(String.class);
|
|
|
|
Graph<String, String> graph =
|
|
|
|
Graph.apply(
|
|
|
|
vertexes.rdd(),
|
|
|
|
edges.rdd(),
|
|
|
|
"",
|
|
|
|
StorageLevel.MEMORY_ONLY(),
|
|
|
|
StorageLevel.MEMORY_ONLY(),
|
|
|
|
stringTag,
|
|
|
|
stringTag
|
|
|
|
);
|
|
|
|
|
|
|
|
GraphOps<String, String> graphOps = new GraphOps<>(graph, stringTag, stringTag);
|
|
|
|
JavaRDD<Tuple2<Object, Object>> cc = graphOps.connectedComponents(maxIterations).vertices().toJavaRDD();
|
|
|
|
|
|
|
|
JavaPairRDD<Object, String> joinResult = vertexes
|
|
|
|
.leftOuterJoin(cc.mapToPair(x -> x))
|
|
|
|
.mapToPair(x -> {
|
|
|
|
if (!x._2()._2().isPresent()) {
|
|
|
|
return new Tuple2<>(x._1(), x._2()._1());
|
|
|
|
} else {
|
|
|
|
return new Tuple2<>(x._2()._2(), x._2()._1());
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
2023-04-17 11:06:27 +02:00
|
|
|
return joinResult
|
|
|
|
.groupByKey()
|
|
|
|
.map(x -> Lists.newArrayList(x._2()))
|
|
|
|
.zipWithUniqueId()
|
|
|
|
.mapToPair(x -> new Tuple2<>("dedup______::" + x._2().toString(), x._1()));
|
2022-04-19 15:29:29 +02:00
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|