dnet-hadoop/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java


package eu.dnetlib.dhp.oa.dedup;

import java.util.Map;
import java.util.stream.Collectors;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.LongAccumulator;

import eu.dnetlib.dhp.oa.dedup.model.Block;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.BlockProcessor;
import scala.Serializable;
import scala.Tuple2;

/**
 * Static Spark helpers for the deduplication workflow: grouping documents into blocks by their
 * grouping keys and running the block processor over those blocks to obtain relations.
 */
public class Deduper implements Serializable {

	/**
	 * Runs a {@link BlockProcessor} over every block and returns the distinct relations reported by it.
	 *
	 * @param context Spark context, used to register the accumulators handed to the reporter
	 * @param blocks  blocks keyed by their grouping key
	 * @param config  dedup configuration passed to the accumulators factory and the block processor
	 * @return the relations collected by the reporter, each (first, second) pair appearing once
	 */
	public static JavaPairRDD<String, String> computeRelations(
		JavaSparkContext context, JavaPairRDD<String, Block> blocks, DedupConfig config) {
		final Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());

		return blocks
			.flatMapToPair(
				it -> {
					// one reporter per block: it gathers the relations produced while processing that block
					final SparkReporter reporter = new SparkReporter(accumulators);
					new BlockProcessor(config)
						.processSortedBlock(it._1(), it._2().getDocuments(), reporter);
					return reporter.getRelations().iterator();
				})
			// De-duplicate on the whole pair. Keying on the plain concatenation of the two ids is
			// ambiguous (("ab","c") and ("a","bc") would both become "abc") and could drop a relation.
			.distinct();
	}

	/**
	 * Groups the documents into blocks, one block per grouping key.
	 *
	 * @param mapDocs documents keyed by identifier
	 * @param config  dedup configuration providing the grouping keys, the order field and the
	 *                maximum queue size
	 * @return blocks keyed by grouping key, restricted to blocks holding more than one document
	 */
	public static JavaPairRDD<String, Block> createSortedBlocks(
		JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
		final String of = config.getWf().getOrderField();
		final int maxQueueSize = config.getWf().getQueueMaxSize();

		return mapDocs
			// keep a single document per identifier, in case the input contains repeated ids
			.reduceByKey((a, b) -> a)
			.map(Tuple2::_2)
			// Clustering: from <id, doc> to one single-document block per grouping key of the doc
			.flatMap(
				a -> DedupUtility
					.getGroupingKeys(config, a)
					.stream()
					.map(it -> Block.from(it, a))
					.collect(Collectors.toList())
					.iterator())
			.mapToPair(block -> new Tuple2<>(block.getKey(), block))
			// merge the blocks sharing a key, passing the order field and queue limit to Block.from
			.reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize))
			// a block with fewer than two documents has nothing to compare
			.filter(b -> b._2().getDocuments().size() > 1);
	}
}
switched automatic code formatting plugin to net.revelc.code.formatter:formatter-maven-plugin 2020-04-27 14:52:31 +02:00
moved openaire specific implementations under dedicated package eu.dnetlib.dhp.oa 2020-03-27 10:42:17 +01:00			`package eu.dnetlib.dhp.oa.dedup;`
Implemented deduplication on spark 2019-12-06 13:38:00 +01:00
introduced common project code formatting plugin, works on the commit hook, based on https://github.com/Cosium/git-code-format-maven-plugin, applied to each java class in the project 2020-04-18 12:42:58 +02:00			`import java.util.Map;`
			`import java.util.stream.Collectors;`
reformatted code according to the updated style descriptor 2020-04-28 11:23:29 +02:00
Implemented deduplication on spark 2019-12-06 13:38:00 +01:00			`import org.apache.spark.api.java.JavaPairRDD;`
			`import org.apache.spark.api.java.JavaSparkContext;`
			`import org.apache.spark.util.LongAccumulator;`
reformatted code according to the updated style descriptor 2020-04-28 11:23:29 +02:00
			`import eu.dnetlib.dhp.oa.dedup.model.Block;`
			`import eu.dnetlib.pace.config.DedupConfig;`
			`import eu.dnetlib.pace.model.MapDocument;`
			`import eu.dnetlib.pace.util.BlockProcessor;`
Implemented deduplication on spark 2019-12-06 13:38:00 +01:00			`import scala.Serializable;`
			`import scala.Tuple2;`

			`public class Deduper implements Serializable {`

switched automatic code formatting plugin to net.revelc.code.formatter:formatter-maven-plugin 2020-04-27 14:52:31 +02:00			`public static JavaPairRDD<String, String> computeRelations(`
			`JavaSparkContext context, JavaPairRDD<String, Block> blocks, DedupConfig config) {`
			`Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());`
introduced common project code formatting plugin, works on the commit hook, based on https://github.com/Cosium/git-code-format-maven-plugin, applied to each java class in the project 2020-04-18 12:42:58 +02:00
switched automatic code formatting plugin to net.revelc.code.formatter:formatter-maven-plugin 2020-04-27 14:52:31 +02:00			`return blocks`
			`.flatMapToPair(`
			`it -> {`
			`final SparkReporter reporter = new SparkReporter(accumulators);`
			`new BlockProcessor(config)`
			`.processSortedBlock(it._1(), it._2().getDocuments(), reporter);`
			`return reporter.getRelations().iterator();`
			`})`
			`.mapToPair(it -> new Tuple2<>(it._1() + it._2(), it))`
			`.reduceByKey((a, b) -> a)`
			`.mapToPair(Tuple2::_2);`
			`}`
Implemented deduplication on spark 2019-12-06 13:38:00 +01:00
switched automatic code formatting plugin to net.revelc.code.formatter:formatter-maven-plugin 2020-04-27 14:52:31 +02:00			`public static JavaPairRDD<String, Block> createSortedBlocks(`
			`JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {`
			`final String of = config.getWf().getOrderField();`
fix: deduper must use queueMaxSize instead of groupMaxSize for the block definition 2020-07-02 12:43:51 +02:00			`final int maxQueueSize = config.getWf().getQueueMaxSize();`
various refactorings on the dnet-dedup-openaire workflow 2020-04-18 12:06:23 +02:00
switched automatic code formatting plugin to net.revelc.code.formatter:formatter-maven-plugin 2020-04-27 14:52:31 +02:00			`return mapDocs`
			`// the reduce is just to be sure that we haven't document with same id`
			`.reduceByKey((a, b) -> a)`
			`.map(Tuple2::_2)`
			`// Clustering: from <id, doc> to List<groupkey,doc>`
			`.flatMap(`
			`a -> DedupUtility`
			`.getGroupingKeys(config, a)`
			`.stream()`
			`.map(it -> Block.from(it, a))`
			`.collect(Collectors.toList())`
			`.iterator())`
			`.mapToPair(block -> new Tuple2<>(block.getKey(), block))`
fix: filter the blocks with size = 1 2020-07-16 10:11:32 +02:00			`.reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize))`
			`.filter(b -> b._2().getDocuments().size() > 1);`
switched automatic code formatting plugin to net.revelc.code.formatter:formatter-maven-plugin 2020-04-27 14:52:31 +02:00			`}`
introduced common project code formatting plugin, works on the commit hook, based on https://github.com/Cosium/git-code-format-maven-plugin, applied to each java class in the project 2020-04-18 12:42:58 +02:00			`}`