package eu.dnetlib;

import eu.dnetlib.graph.GraphProcessor;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.utils.PaceUtils;
import eu.dnetlib.reporter.SparkBlockProcessor;
import eu.dnetlib.reporter.SparkReporter;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.graphx.Edge;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;
import scala.Tuple2;

import java.io.IOException;
import java.util.Map;
import java.util.stream.Collectors;

public class SparkTest {

    public static void main(String[] args) throws IOException {
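
        // expected arguments: <input records path> <dedup config path> <output base path>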
        final String inputSpacePath = args[0];
        final String dedupConfigPath = args[1];
        final String groupsPath = args[2] + "_groups";
        final String outputPath = args[2] + "_output";
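
        // set up the Spark session and context (the master is hard-coded to "yarn")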
        final SparkSession spark = SparkSession
                .builder()
                .appName("Deduplication")
                .master("yarn")
                .getOrCreate();
        final JavaSparkContext context = new JavaSparkContext(spark.sparkContext());
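
        // load the input records and the dedup configuration from HDFS,
        // together with the accumulators used later by the block processor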
        final JavaRDD<String> dataRDD = Utility.loadDataFromHDFS(inputSpacePath, context);
        final DedupConfig config = Utility.loadConfigFromHDFS(dedupConfigPath);
        Map<String, LongAccumulator> accumulators = Utility.constructAccumulator(config, context.sc());

        // create the vertices of the graph: <ID, MapDocument>
        JavaPairRDD<String, MapDocument> mapDocs = dataRDD.mapToPair(it -> {
            MapDocument mapDocument = PaceUtils.asMapDocument(config, it);
            return new Tuple2<>(mapDocument.getIdentifier(), mapDocument);
        });
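
        // GraphX requires numeric vertex ids: use the hash code of the document identifier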
        RDD<Tuple2<Object, MapDocument>> vertexes = mapDocs
                .mapToPair(t -> new Tuple2<Object, MapDocument>((long) t._1().hashCode(), t._2()))
                .rdd();

        // group the documents according to the clustering functions defined in the configuration
        JavaPairRDD<String, Iterable<MapDocument>> blocks = mapDocs
                // the reduce makes sure there is at most one document per identifier
                .reduceByKey((a, b) -> a)
                // clustering: from <id, doc> to <groupKey, doc> pairs
                .flatMapToPair(a -> {
                    final MapDocument currentDocument = a._2();
                    return Utility.getGroupingKeys(config, currentDocument).stream()
                            .map(it -> new Tuple2<>(it, currentDocument))
                            .collect(Collectors.toList())
                            .iterator();
                })
                .groupByKey();
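
        // save the blocks as text, removing a previous output if present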
        Utility.deleteIfExists(groupsPath);
        blocks.map(group -> new DocumentsBlock(group._1(), group._2())).saveAsTextFile(groupsPath);

        // create similarity relations by comparing only documents belonging to the same block
        final JavaPairRDD<String, String> relationRDD = blocks.flatMapToPair(it -> {
            // the SparkReporter is expected to collect the pairs of identifiers judged similar by the block processor
            final SparkReporter reporter = new SparkReporter();
            new SparkBlockProcessor(config).process(it._1(), it._2(), reporter, accumulators);
            return reporter.getReport().iterator();
        });
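
        // turn each similarity relation into a GraphX edge labelled "similarTo",
        // using the same hash-based ids chosen for the vertices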
        final RDD<Edge<String>> edgeRdd = relationRDD
                .map(it -> new Edge<>(it._1().hashCode(), it._2().hashCode(), "similarTo"))
                .rdd();
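
        // find the connected components of the similarity graph (20 is presumably the maximum number of iterations)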
        JavaRDD<ConnectedComponent> ccs = GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD();

        // save the connected components as text, removing a previous output if present
        Utility.deleteIfExists(outputPath);
        ccs.saveAsTextFile(outputPath);
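
        // split the components: groups with more than one document are duplicates, singletons are not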
        final JavaRDD<ConnectedComponent> connectedComponents = ccs.filter(cc -> cc.getDocs().size() > 1);
        final JavaRDD<ConnectedComponent> nonDeduplicated = ccs.filter(cc -> cc.getDocs().size() == 1);
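
        // print a short summary and the final value of each accumulator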
        System.out.println("Non duplicates: " + nonDeduplicated.count());
        System.out.println("Duplicates: " + connectedComponents.flatMap(cc -> cc.getDocs().iterator()).count());
        System.out.println("Connected Components: " + connectedComponents.count());
        accumulators.forEach((name, acc) -> System.out.println(name + " -> " + acc.value()));
    }
}