package eu.dnetlib;

import eu.dnetlib.graph.GraphProcessor;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.utils.PaceUtils;
import eu.dnetlib.reporter.SparkBlockProcessor;
import eu.dnetlib.reporter.SparkReporter;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.graphx.Edge;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;
import scala.Tuple2;

import java.io.IOException;
import java.util.Map;
import java.util.stream.Collectors;

public class SparkTest {

    public static void main(String[] args) throws IOException {
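
        // expected arguments: <input records path> <dedup config path> <output base path>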
        final String inputSpacePath = args[0];
        final String dedupConfigPath = args[1];
        final String groupsPath = args[2] + "_groups";
        final String outputPath = args[2] + "_output";
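
        // set up the Spark session and context (the master is hard-coded to "yarn")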
        final SparkSession spark = SparkSession
                .builder()
                .appName("Deduplication")
                .master("yarn")
                .getOrCreate();
        final JavaSparkContext context = new JavaSparkContext(spark.sparkContext());
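
        // load the input records and the dedup configuration from HDFS,
        // together with the accumulators used later by the block processor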
        final JavaRDD<String> dataRDD = Utility.loadDataFromHDFS(inputSpacePath, context);
        final DedupConfig config = Utility.loadConfigFromHDFS(dedupConfigPath);
        Map<String, LongAccumulator> accumulators = Utility.constructAccumulator(config, context.sc());

        // create the vertices of the graph: <ID, MapDocument>
        JavaPairRDD<String, MapDocument> mapDocs = dataRDD.mapToPair(it -> {
            MapDocument mapDocument = PaceUtils.asMapDocument(config, it);
            return new Tuple2<>(mapDocument.getIdentifier(), mapDocument);
        });
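
        // GraphX requires numeric vertex ids: use the hash code of the document identifier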
        RDD<Tuple2<Object, MapDocument>> vertexes = mapDocs
                .mapToPair(t -> new Tuple2<Object, MapDocument>((long) t._1().hashCode(), t._2()))
                .rdd();

        // group the documents according to the clustering functions defined in the configuration
        JavaPairRDD<String, Iterable<MapDocument>> blocks = mapDocs
                // the reduce makes sure there is at most one document per identifier
                .reduceByKey((a, b) -> a)
                // clustering: from <id, doc> to <groupKey, doc> pairs
                .flatMapToPair(a -> {
                    final MapDocument currentDocument = a._2();
                    return Utility.getGroupingKeys(config, currentDocument).stream()
                            .map(it -> new Tuple2<>(it, currentDocument))
                            .collect(Collectors.toList())
                            .iterator();
                })
                .groupByKey();
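
        // save the blocks as text, removing a previous output if present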
        Utility.deleteIfExists(groupsPath);
        blocks.map(group -> new DocumentsBlock(group._1(), group._2())).saveAsTextFile(groupsPath);

        // create similarity relations by comparing only documents belonging to the same block
        final JavaPairRDD<String, String> relationRDD = blocks.flatMapToPair(it -> {
            // the SparkReporter is expected to collect the pairs of identifiers judged similar by the block processor
            final SparkReporter reporter = new SparkReporter();
            new SparkBlockProcessor(config).process(it._1(), it._2(), reporter, accumulators);
            return reporter.getReport().iterator();
        });
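
        // turn each similarity relation into a GraphX edge labelled "similarTo",
        // using the same hash-based ids chosen for the vertices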
        final RDD<Edge<String>> edgeRdd = relationRDD
                .map(it -> new Edge<>(it._1().hashCode(), it._2().hashCode(), "similarTo"))
                .rdd();
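
        // find the connected components of the similarity graph (20 is presumably the maximum number of iterations)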
        JavaRDD<ConnectedComponent> ccs = GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD();

        // save the connected components as text, removing a previous output if present
        Utility.deleteIfExists(outputPath);
        ccs.saveAsTextFile(outputPath);
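
        // split the components: groups with more than one document are duplicates, singletons are not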
        final JavaRDD<ConnectedComponent> connectedComponents = ccs.filter(cc -> cc.getDocs().size() > 1);
        final JavaRDD<ConnectedComponent> nonDeduplicated = ccs.filter(cc -> cc.getDocs().size() == 1);
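
        // print a short summary and the final value of each accumulator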
        System.out.println("Non duplicates: " + nonDeduplicated.count());
        System.out.println("Duplicates: " + connectedComponents.flatMap(cc -> cc.getDocs().iterator()).count());
        System.out.println("Connected Components: " + connectedComponents.count());
        accumulators.forEach((name, acc) -> System.out.println(name + " -> " + acc.value()));
    }
}