package eu.dnetlib.dedup;

import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.BlockProcessor;
import eu.dnetlib.pace.util.MapDocumentUtil;

import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.util.LongAccumulator;

import scala.Serializable;
import scala.Tuple2;
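
/**
 * Spark-based deduplication helpers: maps JSON entities to {@link MapDocument} vertexes, groups them into
 * blocks according to the clustering functions of the {@link DedupConfig}, and compares the documents of
 * each block to produce the similarity relations.
 *
 * <p>Example usage (a minimal sketch: it assumes that {@code jsc}, {@code entities} and {@code config} are
 * built elsewhere, e.g. by the Spark actions that invoke this class):</p>
 *
 * <pre>{@code
 * JavaSparkContext jsc = ...;        // the Spark context
 * JavaRDD<String> entities = ...;    // the JSON entities to deduplicate
 * DedupConfig config = ...;          // the dedup configuration
 *
 * JavaPairRDD<String, String> relations = Deduper.dedup(jsc, entities, config);
 * }</pre>
 */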
public class Deduper implements Serializable {

	private static final Log log = LogFactory.getLog(Deduper.class);

	/**
	 * Deduplicates the given entities: builds the vertexes, groups them into blocks and compares the
	 * documents within each block.
	 *
	 * @param context the Spark context
	 * @param entities the list of JSON entities to be deduplicated
	 * @param config the dedup configuration
	 * @return the list of relations generated by the deduplication
	 */
	public static JavaPairRDD<String, String> dedup(
		JavaSparkContext context, JavaRDD<String> entities, DedupConfig config) {

		Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());

		// create vertexes of the graph: <ID, MapDocument>
		JavaPairRDD<String, MapDocument> mapDocs = mapToVertexes(context, entities, config);

		// create blocks for deduplication
		JavaPairRDD<String, Iterable<MapDocument>> blocks = createBlocks(context, mapDocs, config);

		// create relations by comparing only elements in the same group
		return computeRelations(context, blocks, config);

		// final RDD<Edge<String>> edgeRdd = relationRDD.map(it -> new
		// Edge<>(it._1().hashCode(),
		// it._2().hashCode(), "equalTo")).rdd();
		//
		// RDD<Tuple2<Object, MapDocument>> vertexes =
		// mapDocs.mapToPair((PairFunction<Tuple2<String, MapDocument>, Object, MapDocument>) t ->
		// new
		// Tuple2<Object, MapDocument>((long) t._1().hashCode(), t._2())).rdd();
		// accumulators.forEach((name, acc) -> log.info(name + " -> " + acc.value()));
		//
		// return GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD();
	}

	/**
	 * Computes the relations by comparing only the documents that belong to the same block.
	 *
	 * @param context the Spark context
	 * @param blocks the list of blocks
	 * @param config the dedup configuration
	 * @return the list of relations generated by the deduplication
	 */
	public static JavaPairRDD<String, String> computeRelations(
		JavaSparkContext context,
		JavaPairRDD<String, Iterable<MapDocument>> blocks,
		DedupConfig config) {

		Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());

		return blocks
			.flatMapToPair(
				(PairFlatMapFunction<Tuple2<String, Iterable<MapDocument>>, String, String>) it -> {
					final SparkReporter reporter = new SparkReporter(accumulators);
					new BlockProcessor(config).process(it._1(), it._2(), reporter);
					return reporter.getRelations().iterator();
				})
			// remove duplicated relations: key each pair by (source + target) and keep a single occurrence
			.mapToPair(
				(PairFunction<Tuple2<String, String>, String, Tuple2<String, String>>) item -> new Tuple2<String, Tuple2<String, String>>(
					item._1() + item._2(), item))
			.reduceByKey((a, b) -> a)
			.mapToPair(
				(PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
	}

	/**
	 * Creates the blocks of documents, grouping the entities by the keys produced by the clustering
	 * functions of the dedup configuration.
	 *
	 * @param context the Spark context
	 * @param mapDocs the list of entities as pairs <id, MapDocument>
	 * @param config the dedup configuration
	 * @return the list of blocks based on the clustering of the dedup configuration
	 */
	public static JavaPairRDD<String, Iterable<MapDocument>> createBlocks(
		JavaSparkContext context, JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
		return mapDocs
			// the reduce makes sure we don't keep more than one document with the same id
			.reduceByKey((a, b) -> a)
			.map(Tuple2::_2)
			// Clustering: from <id, doc> to List<groupkey, doc>
			.flatMapToPair(
				(PairFlatMapFunction<MapDocument, String, MapDocument>) a -> DedupUtility
					.getGroupingKeys(config, a)
					.stream()
					.map(it -> new Tuple2<>(it, a))
					.collect(Collectors.toList())
					.iterator())
			.groupByKey();
	}
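
	/**
	 * Creates the blocks of documents as sorted lists: entities are grouped by the clustering keys and each
	 * block is kept sorted on the order field of the dedup configuration, truncated to the configured
	 * maximum group size.
	 *
	 * @param context the Spark context
	 * @param mapDocs the list of entities as pairs <id, MapDocument>
	 * @param config the dedup configuration
	 * @return the list of sorted blocks
	 */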
	public static JavaPairRDD<String, List<MapDocument>> createsortedBlocks(
		JavaSparkContext context, JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
		final String of = config.getWf().getOrderField();
		final int maxQueueSize = config.getWf().getGroupMaxSize();
		return mapDocs
			// the reduce makes sure we don't keep more than one document with the same id
			.reduceByKey((a, b) -> a)
			.map(Tuple2::_2)
			// Clustering: from <id, doc> to List<groupkey, doc>
			.flatMapToPair(
				(PairFlatMapFunction<MapDocument, String, List<MapDocument>>) a -> DedupUtility
					.getGroupingKeys(config, a)
					.stream()
					.map(
						it -> {
							List<MapDocument> tmp = new ArrayList<>();
							tmp.add(a);
							return new Tuple2<>(it, tmp);
						})
					.collect(Collectors.toList())
					.iterator())
			// merge the lists of each group, keep them sorted on the order field and cap their size
			.reduceByKey(
				(Function2<List<MapDocument>, List<MapDocument>, List<MapDocument>>) (v1, v2) -> {
					v1.addAll(v2);
					v1.sort(Comparator.comparing(a -> a.getFieldMap().get(of).stringValue()));
					if (v1.size() > maxQueueSize)
						return new ArrayList<>(v1.subList(0, maxQueueSize));
					return v1;
				});
	}

	/**
	 * Maps the JSON entities to the vertexes of the graph.
	 *
	 * @param context the Spark context
	 * @param entities the list of JSON entities
	 * @param config the dedup configuration
	 * @return the list of vertexes: <id, mapDocument>
	 */
	public static JavaPairRDD<String, MapDocument> mapToVertexes(
		JavaSparkContext context, JavaRDD<String> entities, DedupConfig config) {

		return entities
			.mapToPair(
				(PairFunction<String, String, MapDocument>) s -> {
					MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(config, s);
					return new Tuple2<String, MapDocument>(mapDocument.getIdentifier(), mapDocument);
				});
	}
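
	/**
	 * Computes the relations from the sorted blocks produced by {@link #createsortedBlocks}; duplicated
	 * relations are removed.
	 *
	 * @param context the Spark context
	 * @param blocks the list of sorted blocks
	 * @param config the dedup configuration
	 * @return the list of relations generated by the deduplication
	 */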
	public static JavaPairRDD<String, String> computeRelations2(
		JavaSparkContext context, JavaPairRDD<String, List<MapDocument>> blocks, DedupConfig config) {
		Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());

		return blocks
			.flatMapToPair(
				(PairFlatMapFunction<Tuple2<String, List<MapDocument>>, String, String>) it -> {
					try {
						final SparkReporter reporter = new SparkReporter(accumulators);
						new BlockProcessor(config).processSortedBlock(it._1(), it._2(), reporter);
						return reporter.getRelations().iterator();
					} catch (Exception e) {
						throw new RuntimeException(it._2().get(0).getIdentifier(), e);
					}
				})
			.mapToPair(
				(PairFunction<Tuple2<String, String>, String, Tuple2<String, String>>) item -> new Tuple2<String, Tuple2<String, String>>(
					item._1() + item._2(), item))
			.reduceByKey((a, b) -> a)
			.mapToPair(
				(PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
	}
}