Implementation of the dedup test and addition of new support classes

2020-06-11 10:46:46 +02:00 · 2020-06-11 10:46:46 +02:00 · b3ec4194da
parent aa4d03cfa3
commit b3ec4194da
12 changed files with 394 additions and 301 deletions
--- a/dnet-dedup-test/src/main/java/eu/dnetlib/Deduper.java
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/Deduper.java
@ -1,5 +1,6 @@
 package eu.dnetlib;

+import com.google.common.hash.Hashing;
 import eu.dnetlib.graph.GraphProcessor;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.model.MapDocument;
@ -7,105 +8,182 @@ import eu.dnetlib.pace.util.BlockProcessor;
 import eu.dnetlib.pace.util.MapDocumentUtil;
 import eu.dnetlib.pace.utils.Utility;
 import eu.dnetlib.reporter.SparkReporter;
+import eu.dnetlib.support.Block;
 import eu.dnetlib.support.ConnectedComponent;
+import eu.dnetlib.support.Relation;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.Optional;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.graphx.Edge;
 import org.apache.spark.rdd.RDD;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
 import org.apache.spark.util.LongAccumulator;
 import scala.Serializable;
 import scala.Tuple2;

-import java.util.Map;
+import java.util.*;
 import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;

 public class Deduper implements Serializable {

    private static final Log log = LogFactory.getLog(Deduper.class);

-    /**
-     * @param: the spark context
-     * @param: list of JSON entities to be deduped
-     * @param: the dedup configuration
-     *
-     * @return the list of connected components generated by the deduplication
-     */
-    public static JavaRDD<ConnectedComponent> dedup(JavaSparkContext context, JavaRDD<String> entities, DedupConfig config){
+    public static JavaPairRDD<String, Block> createSortedBlocks(
+            JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
+        final String of = config.getWf().getOrderField();
+        final int maxQueueSize = config.getWf().getGroupMaxSize();

+        return mapDocs
+                // the reduce only ensures that no two documents share the same id
+                .reduceByKey((a, b) -> a)
+                .map(Tuple2::_2)
+                // Clustering: from <id, doc> to List<groupkey,doc>
+                .flatMap(
+                        a -> Utility
+                                .getGroupingKeys(config, a)
+                                .stream()
+                                .map(it -> Block.from(it, a))
+                                .collect(Collectors.toList())
+                                .iterator())
+                .mapToPair(block -> new Tuple2<>(block.getKey(), block))
+                .reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize));
+    }
+
+    public static Iterator<Tuple2<String, String>> ccToMergeRel(ConnectedComponent cc, DedupConfig dedupConf) {
+        return cc
+                .getDocs()
+                .stream()
+                .flatMap(
+                        id -> {
+                            List<Tuple2<String, String>> tmp = new ArrayList<>();
+                            tmp.add(new Tuple2<>(cc.getCcId(), id));
+                            return tmp.stream();
+                        })
+                .iterator();
+    }
+
+    public static long hash(final String id) {
+        return Hashing.murmur3_128().hashString(id).asLong();
+    }
+
+    public static ConnectedComponent entityMerger(String key, Iterator<String> values) {
+
+        ConnectedComponent cc = new ConnectedComponent();
+        cc.setCcId(key);
+        cc.setDocs(StreamSupport.stream(Spliterators.spliteratorUnknownSize(values, Spliterator.ORDERED), false)
+                .collect(Collectors.toSet()));
+        return cc;
+    }
+
+    public static JavaRDD<Relation> computeRelations(
+            JavaSparkContext context, JavaPairRDD<String, Block> blocks, DedupConfig config) {
        Map<String, LongAccumulator> accumulators = Utility.constructAccumulator(config, context.sc());

-        //create vertexes of the graph: <ID, MapDocument>
-        JavaPairRDD<String, MapDocument> mapDocs = mapToVertexes(context, entities, config);
-        RDD<Tuple2<Object, MapDocument>> vertexes = mapDocs.mapToPair(t -> new Tuple2<Object, MapDocument>(Utility.getHashcode(t._1()), t._2())).rdd();
-
-        //create blocks for deduplication
-        JavaPairRDD<String, Iterable<MapDocument>> blocks = createBlocks(context, mapDocs, config);
-
-        //create relations by comparing only elements in the same group
-        final JavaPairRDD<String, String> relationRDD = computeRelations(context, blocks, config);
-
-        System.out.println("Number of relations = " + relationRDD.distinct().count());
-
-        final RDD<Edge<String>> edgeRdd = relationRDD.map(it -> new Edge<>(Utility.getHashcode(it._1()),Utility.getHashcode(it._2()), "isSimilarTo")).rdd();
-
-        accumulators.forEach((name, acc) -> log.info(name + " -> " + acc.value()));
-
-        return GraphProcessor.findCCs(vertexes, edgeRdd, config.getWf().getMaxIterations()).toJavaRDD();
+        return blocks
+                .flatMapToPair(
+                        it -> {
+                            final SparkReporter reporter = new SparkReporter(accumulators);
+                            new BlockProcessor(config)
+                                    .processSortedBlock(it._1(), it._2().getDocuments(), reporter);
+                            return reporter.getRelations().iterator();
+                        })
+                .mapToPair(it -> new Tuple2<>(it._1() + it._2(), new Relation(it._1(), it._2(), "simRel")))
+                .reduceByKey((a, b) -> a)
+                .map(Tuple2::_2);
    }

-    /**
-     * @param: the spark context
-     * @param: list of blocks
-     * @param: the dedup configuration
-     *
-     * @return the list of relations generated by the deduplication
-     */
-    public static JavaPairRDD<String, String> computeRelations(JavaSparkContext context, JavaPairRDD<String, Iterable<MapDocument>> blocks, DedupConfig config) {
+    public static void createSimRels(DedupConfig dedupConf, SparkSession spark, String entitiesPath, String simRelsPath){

-        Map<String, LongAccumulator> accumulators = Utility.constructAccumulator(config, context.sc());
-        return blocks.flatMapToPair(it -> {
-            final SparkReporter reporter = new SparkReporter(accumulators);
-            new BlockProcessor(config).process(it._1(), it._2(), reporter);
-            return reporter.getRelations().iterator();
-        });
+        JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+        JavaPairRDD<String, MapDocument> mapDocuments = sc
+                .textFile(entitiesPath)
+                .mapToPair(
+                        (PairFunction<String, String, MapDocument>) s -> {
+                            MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s);
+                            return new Tuple2<>(d.getIdentifier(), d);
+                        });
+
+        // create blocks for deduplication
+        JavaPairRDD<String, Block> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf);
+
+        // create relations by comparing only elements in the same group
+        JavaRDD<Relation> relations = Deduper.computeRelations(sc, blocks, dedupConf);
+
+        // save the simrel in the workingdir
+        spark
+                .createDataset(relations.rdd(), Encoders.bean(Relation.class))
+                .write()
+                .mode(SaveMode.Overwrite)
+                .save(simRelsPath);
    }

-    /**
-     * @param: the spark context
-     * @param: list of entities: <id, entity>
-     * @param: the dedup configuration
-     *
-     * @return the list of blocks based on clustering of dedup configuration
-     */
-    public static JavaPairRDD<String, Iterable<MapDocument>> createBlocks(JavaSparkContext context, JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
+    public static void createMergeRels(DedupConfig dedupConf, String entitiesPath, String mergeRelsPath, String simRelsPath, SparkSession spark){

-        return mapDocs.reduceByKey((a, b) -> a)    //the reduce is just to be sure that we haven't document with same id
-                //Clustering: from <id, doc> to List<groupkey,doc>
-                .flatMapToPair(a -> {
-                    final MapDocument currentDocument = a._2();
+        final int maxIterations = dedupConf.getWf().getMaxIterations();

-                    return Utility.getGroupingKeys(config, currentDocument).stream()
-                            .map(it -> new Tuple2<>(it, currentDocument)).collect(Collectors.toList()).iterator();
-                }).groupByKey();
+        JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

+        final JavaPairRDD<Object, String> vertexes = sc
+                .textFile(entitiesPath)
+                .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
+                .mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));
+
+        final RDD<Edge<String>> edgeRdd = spark
+                .read()
+                .load(simRelsPath)
+                .as(Encoders.bean(Relation.class))
+                .javaRDD()
+                .map(Relation::toEdgeRdd)
+                .rdd();
+
+        final Dataset<Relation> mergeRels = spark
+                .createDataset(
+                        GraphProcessor
+                                .findCCs(vertexes.rdd(), edgeRdd, maxIterations)
+                                .toJavaRDD()
+                                .filter(k -> k.getDocs().size() > 1)
+                                .flatMap(cc -> ccToMergeRel(cc, dedupConf))
+                                .map(it -> new Relation(it._1(), it._2(), "mergeRel"))
+                                .rdd(),
+                        Encoders.bean(Relation.class));
+
+        mergeRels.write().mode(SaveMode.Overwrite).parquet(mergeRelsPath);
    }

-    /**
-     * @param: the spark context
-     * @param: list of JSON entities
-     * @param: the dedup configuration
-     *
-     * @return the list of vertexes: <id, mapDocument>
-     */
-    public static JavaPairRDD<String, MapDocument> mapToVertexes(JavaSparkContext context, JavaRDD<String> entities, DedupConfig config){
-        return entities.mapToPair(it -> {
-            MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(config, it);
-            return new Tuple2<>(mapDocument.getIdentifier(), mapDocument);
-        });
+    public static void createDedupEntity(DedupConfig dedupConf, String mergeRelsPath, String entitiesPath, SparkSession spark, String dedupEntityPath){
+
+        JavaPairRDD<String, String> entities = spark
+                .read()
+                .textFile(entitiesPath)
+                .map((MapFunction<String, Tuple2<String, String>>) it ->
+                                new Tuple2<>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it), it),
+                        Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
+                .toJavaRDD()
+                .mapToPair(t -> t);
+
+        // <source, target>: source is the dedup id, target is the id of a record merged into it
+        JavaPairRDD<String, Relation> mergeRels = spark
+                .read()
+                .load(mergeRelsPath)
+                .as(Encoders.bean(Relation.class))
+                .toJavaRDD()
+                .mapToPair(r -> new Tuple2<>(r.getTarget(), r));
+
+        JavaRDD<ConnectedComponent> dedupEntities = mergeRels.join(entities)
+                .mapToPair(t -> new Tuple2<>(t._2()._1().getSource(), t._2()._2()))
+                .groupByKey()
+                .map(t-> entityMerger(t._1(), t._2().iterator()));
+
+        dedupEntities.saveAsTextFile(dedupEntityPath);
    }

 }
--- a/dnet-dedup-test/src/main/java/eu/dnetlib/graph/GraphProcessor.scala
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/graph/GraphProcessor.scala
@ -1,6 +1,5 @@
 package eu.dnetlib.graph

-import eu.dnetlib.pace.model.MapDocument
 import eu.dnetlib.support.ConnectedComponent
 import org.apache.spark.graphx._
 import org.apache.spark.rdd.RDD
@ -9,30 +8,29 @@ import scala.collection.JavaConversions;

 object GraphProcessor {

-  def findCCs(vertexes: RDD[(VertexId, MapDocument)], edges: RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = {
-    val graph: Graph[MapDocument, String] = Graph(vertexes, edges).partitionBy(PartitionStrategy.RandomVertexCut) //TODO remember to remove partitionby
+  def findCCs(vertexes: RDD[(VertexId, String)], edges: RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = {
+    val graph: Graph[String, String] = Graph(vertexes, edges).partitionBy(PartitionStrategy.RandomVertexCut) //TODO remember to remove partitionby
    val cc = graph.connectedComponents(maxIterations).vertices

    val joinResult = vertexes.leftOuterJoin(cc).map {
-      case (id, (openaireId, cc)) => {
+      case (id, (rawId, cc)) => {
        if (cc.isEmpty) {
-          (id, openaireId)
+          (id, rawId)
        }
        else {
-          (cc.get, openaireId)
+          (cc.get, rawId)
        }
      }
    }
-
-    val connectedComponents = joinResult.groupByKey().map[ConnectedComponent](cc => asConnectedComponent(cc))
-
-    (connectedComponents)
-
+    val connectedComponents = joinResult.groupByKey()
+      .map[ConnectedComponent](cc => asConnectedComponent(cc))
+    connectedComponents
  }

-  def asConnectedComponent(group: (VertexId, Iterable[MapDocument])): ConnectedComponent = {
-    val docs = group._2.toSet[MapDocument]
-    val connectedComponent = new ConnectedComponent(JavaConversions.setAsJavaSet[MapDocument](docs));
+
+  def asConnectedComponent(group: (VertexId, Iterable[String])): ConnectedComponent = {
+    val docs = group._2.toSet[String]
+    val connectedComponent = new ConnectedComponent(JavaConversions.setAsJavaSet[String](docs));
    connectedComponent
  }

--- a/dnet-dedup-test/src/main/java/eu/dnetlib/pace/utils/BlockUtils.java
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/pace/utils/BlockUtils.java
@ -135,7 +135,7 @@ public class BlockUtils implements Serializable {

        System.out.println("optimalBlockSize = " + optimalBlockSize);

-        return blocks.filter(b -> b.getElements().size() < optimalBlockSize);
+        return blocks.filter(b -> b.elements() < optimalBlockSize);
    }

    //cut blocks basing on number of comparisons
@ -152,7 +152,7 @@ public class BlockUtils implements Serializable {
        double RATIO = 0.85;

        return blocks
-                .flatMapToPair(b -> b.getElements().stream().map(e -> new Tuple2<>(e, new Tuple2<>(b.getKey(), b.comparisons()))).iterator())
+                .flatMapToPair(b -> b.getDocuments().stream().map(e -> new Tuple2<>(e, new Tuple2<>(b.getKey(), b.comparisons()))).iterator())
                .groupByKey()
                .mapToPair(es -> {
                    List<Tuple2<String, Integer>> b = Lists.newArrayList(es._2());
--- a/dnet-dedup-test/src/main/java/eu/dnetlib/pace/utils/Utility.java
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/pace/utils/Utility.java
@ -5,6 +5,7 @@ import com.google.common.hash.Hashing;
 import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.model.MapDocument;
+import org.apache.commons.codec.binary.Hex;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
@ -18,6 +19,7 @@ import org.apache.spark.util.LongAccumulator;
 import java.io.IOException;
 import java.io.StringWriter;
 import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Set;
@ -83,4 +85,16 @@ public class Utility {
    public  static long getHashcode(final String id) {
        return Hashing.murmur3_128().hashUnencodedChars(id).asLong();
    }
+
+    public static String md5(final String s) {
+        try {
+            final MessageDigest md = MessageDigest.getInstance("MD5");
+            md.update(s.getBytes(StandardCharsets.UTF_8));
+            return new String(Hex.encodeHex(md.digest()));
+        } catch (final Exception e) {
+            System.err.println("Error creating id");
+            return null;
+        }
+    }
+
 }
--- a/dnet-dedup-test/src/main/java/eu/dnetlib/support/Block.java
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/support/Block.java
@ -1,29 +1,68 @@
 package eu.dnetlib.support;

-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.pace.model.MapDocument;
-import eu.dnetlib.pace.util.PaceException;
-
-import java.io.IOException;
 import java.io.Serializable;
-import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.Iterator;
 import java.util.List;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;
 import java.util.stream.StreamSupport;

+import com.google.common.collect.Lists;
+
+import eu.dnetlib.pace.model.MapDocument;
+
 public class Block implements Serializable {

-    String key;
-    List<MapDocument> elements;
+    private String key;

-    public Block(String key, Iterable<MapDocument> elements){
-        this.key = key;
-        this.elements = StreamSupport.stream(elements.spliterator(), false).collect(Collectors.toList());
+    private List<MapDocument> documents;
+
+    public Block() {
+        super();
    }

-    public Block(String key, List<MapDocument> elements){
+    public Block(String key, Iterable<MapDocument> documents) {
        this.key = key;
-        this.elements = elements;
+        this.documents = Lists.newArrayList(documents);
+    }
+
+    public static Block from(String key, MapDocument doc) {
+        Block block = new Block();
+        block.setKey(key);
+        block.setDocuments(Lists.newArrayList(doc));
+        return block;
+    }
+
+    public static Block from(String key, Iterator<Block> blocks, String orderField, int maxSize) {
+        Block block = new Block();
+        block.setKey(key);
+
+        Iterable<Block> it = () -> blocks;
+
+        block
+                .setDocuments(
+                        StreamSupport
+                                .stream(it.spliterator(), false)
+                                .flatMap(b -> b.getDocuments().stream())
+                                .sorted(Comparator.comparing(a -> a.getFieldMap().get(orderField).stringValue()))
+                                .limit(maxSize)
+                                .collect(Collectors.toCollection(ArrayList::new)));
+        return block;
+    }
+
+    public static Block from(Block b1, Block b2, String orderField, int maxSize) {
+        Block block = new Block();
+        block.setKey(b1.getKey());
+        block
+                .setDocuments(
+                        Stream
+                                .concat(b1.getDocuments().stream(), b2.getDocuments().stream())
+                                .sorted(Comparator.comparing(a -> a.getFieldMap().get(orderField).stringValue()))
+                                .limit(maxSize)
+                                .collect(Collectors.toCollection(ArrayList::new)));
+        return block;
    }

    public String getKey() {
@ -34,30 +73,22 @@ public class Block implements Serializable {
        this.key = key;
    }

-    public List<MapDocument> getElements() {
-        return elements;
+    public List<MapDocument> getDocuments() {
+        return documents;
    }

-    public void setElements(List<MapDocument> elements) {
-        this.elements = elements;
+    public void setDocuments(List<MapDocument> documents) {
+        this.documents = documents;
    }

-    public int comparisons(){
-        int size = elements.size();
-        return (size*(size-1)/2);
+    public int comparisons() {
+        return (documents.size()*(documents.size()-1))/2;
    }

-    public int elements(){
-        return elements.size();
+    public int elements() {
+        return documents.size();
    }

-    @Override
-    public String toString(){
-        ObjectMapper mapper = new ObjectMapper();
-        try {
-            return mapper.writeValueAsString(this);
-        } catch (IOException e) {
-            throw new PaceException("Failed to create Json: ", e);
-        }
-    }
+
 }
+
--- a/dnet-dedup-test/src/main/java/eu/dnetlib/support/ConnectedComponent.java
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/support/ConnectedComponent.java
@ -1,89 +1,61 @@
 package eu.dnetlib.support;

-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.MapDocument;
-import eu.dnetlib.pace.util.PaceException;
-import org.codehaus.jackson.annotate.JsonIgnore;
-
 import java.io.IOException;
 import java.io.Serializable;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
 import java.util.Set;
-import java.util.stream.Collectors;
+
+import eu.dnetlib.pace.utils.Utility;
+import org.apache.commons.lang.StringUtils;
+import org.codehaus.jackson.annotate.JsonIgnore;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.pace.util.PaceException;

 public class ConnectedComponent implements Serializable {

-    private Set<MapDocument> docs;
-    private String id;
-    private Map<String, Field> fieldMap;
+    private Set<String> docs;
+    private String ccId;

    public ConnectedComponent() {
    }

-    public ConnectedComponent(Set<MapDocument> docs) {
+    public ConnectedComponent(Set<String> docs) {
        this.docs = docs;
-        this.id = createID(docs);
-        this.fieldMap = chooseFields(docs);
+        createID();
    }

-    public Set<MapDocument> getDocs() {
-        return docs;
-    }
-
-    public void setDocs(Set<MapDocument> docs) {
-        this.docs = docs;
-    }
-
-    public String getId() {
-        return id;
-    }
-
-    public void setId(String id) {
-        this.id = id;
-    }
-
-    public Map<String, Field> chooseFields(Set<MapDocument> docs) {
-
-        int maxLength = 0;
-        Map<String, Field> maxFieldMap = new HashMap<>();
-        for (MapDocument doc : docs) {
-            if (doc.toString().length()>maxLength){
-                maxFieldMap = doc.getFieldMap();
-                maxLength = doc.toString().length();
-            }
-        }
-
-        return maxFieldMap;
-    }
-
-    public String createID(Set<MapDocument> docs) {
+    public String createID() {
        if (docs.size() > 1) {
-            String ccID = getMin(docs.stream().map(doc -> doc.getIdentifier()).collect(Collectors.toList()));
-            String prefix = ccID.split("\\|")[0];
-            String id = ccID.split("::")[1];
-            return prefix + "|dedup_______::" + id;
+            final String s = getMin();
+            ccId = "dedup::" + Utility.md5(s);
+            return ccId;
        } else {
-            return docs.iterator().next().getIdentifier();
+            return docs.iterator().next();
        }
    }

    @JsonIgnore
-    public String getMin(List<String> ids){
+    public String getMin() {

-        String min = ids.get(0);
-        for(String id: ids)
-            if (min.compareTo(id) > 0) {
-                min = id;
-            }
-
-        return min;
+        final StringBuilder min = new StringBuilder();
+        docs
+                .forEach(
+                        i -> {
+                            if (StringUtils.isBlank(min.toString())) {
+                                min.append(i);
+                            } else {
+                                if (min.toString().compareTo(i) > 0) {
+                                    min.setLength(0);
+                                    min.append(i);
+                                }
+                            }
+                        });
+        return min.toString();
    }

    @Override
-    public String toString(){
+    public String toString() {
        ObjectMapper mapper = new ObjectMapper();
        try {
            return mapper.writeValueAsString(this);
@ -92,11 +64,19 @@ public class ConnectedComponent implements Serializable {
        }
    }

-    public Map<String, Field> getFieldMap() {
-        return fieldMap;
+    public Set<String> getDocs() {
+        return docs;
    }

-    public void setFieldMap(Map<String, Field> fieldMap) {
-        this.fieldMap = fieldMap;
+    public void setDocs(Set<String> docs) {
+        this.docs = docs;
+    }
+
+    public String getCcId() {
+        return ccId;
+    }
+
+    public void setCcId(String ccId) {
+        this.ccId = ccId;
    }
 }
--- a/dnet-dedup-test/src/main/java/eu/dnetlib/support/DnetAccumulator.java
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/support/DnetAccumulator.java
@ -4,7 +4,7 @@ import org.apache.spark.util.AccumulatorV2;

 public class DnetAccumulator extends AccumulatorV2<Long, Long> {

-    private Long counter= 0L;
+    private Long counter = 0L;

    private String group;

--- a/dnet-dedup-test/src/main/java/eu/dnetlib/support/Relation.java
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/support/Relation.java
@ -0,0 +1,50 @@
+package eu.dnetlib.support;
+
+import eu.dnetlib.Deduper;
+import org.apache.spark.graphx.Edge;
+
+import java.io.Serializable;
+
+public class Relation implements Serializable {
+
+    String source;
+    String target;
+    String type;
+
+    public Relation() {
+    }
+
+    public Relation(String source, String target, String type) {
+        this.source = source;
+        this.target = target;
+        this.type = type;
+    }
+
+    public String getSource() {
+        return source;
+    }
+
+    public void setSource(String source) {
+        this.source = source;
+    }
+
+    public String getTarget() {
+        return target;
+    }
+
+    public void setTarget(String target) {
+        this.target = target;
+    }
+
+    public String getType() {
+        return type;
+    }
+
+    public void setType(String type) {
+        this.type = type;
+    }
+
+    public Edge<String> toEdgeRdd(){
+        return new Edge<>(Deduper.hash(source), Deduper.hash(target), type);
+    }
+}
--- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java
+++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java
--- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestUtils.java
+++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestUtils.java
@ -9,29 +9,29 @@ import java.util.List;
 public abstract class DedupTestUtils {


-    public static void printStatistics(JavaRDD<ConnectedComponent> ccs){
-        final JavaRDD<ConnectedComponent> connectedComponents = ccs.filter(cc -> cc.getDocs().size()>1);
-        final JavaRDD<ConnectedComponent> nonDeduplicated = ccs.filter(cc -> cc.getDocs().size()==1);
-
-        //print deduped
-        connectedComponents.map(cc -> {
-            StringBuilder sb = new StringBuilder();
-            for (MapDocument m : cc.getDocs()){
-                sb.append(m.getFieldMap().get("originalId").stringValue() + " - "+ m.getFieldMap().get("legalname").stringValue() + "\n");
-            }
-            return sb.toString();
-        }).foreach(s -> System.out.println("*******\n" + s + "*******\n"));
-
-        //print nondeduped
-        nonDeduplicated.foreach(cc -> {
-            System.out.println(cc.getId() + " - " + cc.getFieldMap().get("legalname").stringValue());
-        });
-
-        System.out.println("Non duplicates: " + nonDeduplicated.count());
-        System.out.println("Duplicates: " + connectedComponents.flatMap(cc -> cc.getDocs().iterator()).count());
-        System.out.println("Connected Components: " + connectedComponents.count());
-
-    }
+//    public static void printStatistics(JavaRDD<ConnectedComponent> ccs){
+//        final JavaRDD<ConnectedComponent> connectedComponents = ccs.filter(cc -> cc.getDocs().size()>1);
+//        final JavaRDD<ConnectedComponent> nonDeduplicated = ccs.filter(cc -> cc.getDocs().size()==1);
+//
+//        //print deduped
+//        connectedComponents.map(cc -> {
+//            StringBuilder sb = new StringBuilder();
+//            for (MapDocument m : cc.getDocs()){
+//                sb.append(m.getFieldMap().get("originalId").stringValue() + " - "+ m.getFieldMap().get("legalname").stringValue() + "\n");
+//            }
+//            return sb.toString();
+//        }).foreach(s -> System.out.println("*******\n" + s + "*******\n"));
+//
+//        //print nondeduped
+//        nonDeduplicated.foreach(cc -> {
+//            System.out.println(cc.getId() + " - " + cc.getFieldMap().get("legalname").stringValue());
+//        });
+//
+//        System.out.println("Non duplicates: " + nonDeduplicated.count());
+//        System.out.println("Duplicates: " + connectedComponents.flatMap(cc -> cc.getDocs().iterator()).count());
+//        System.out.println("Connected Components: " + connectedComponents.count());
+//
+//    }

    public static String getOrganizationLegalname(MapDocument mapDocument){
        return mapDocument.getFieldMap().get("legalname").stringValue();
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/organization/organization.gz
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/organization/organization.gz
--- a/dnet-pace-core/release.properties
+++ b/dnet-pace-core/release.properties
@ -0,0 +1,11 @@
+#release configuration
+#Fri Apr 24 14:37:08 CEST 2020
+scm.tagNameFormat=@{project.artifactId}-@{project.version}
+pushChanges=true
+scm.url=scm\:git\:https\://code-repo.d4science.org/D-Net/dnet-dedup.git/dnet-pace-core
+preparationGoals=clean verify
+projectVersionPolicyId=default
+remoteTagging=true
+scm.commentPrefix=[maven-release-plugin] 
+exec.snapshotReleasePluginAllowed=false
+completedPhase=scm-check-modifications