Implementation of the dedup test and addition of new support classes
parent aa4d03cfa3
commit b3ec4194da
Deduper.java
@@ -1,5 +1,6 @@
 package eu.dnetlib;
 
+import com.google.common.hash.Hashing;
 import eu.dnetlib.graph.GraphProcessor;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.model.MapDocument;
@@ -7,105 +8,182 @@ import eu.dnetlib.pace.util.BlockProcessor;
 import eu.dnetlib.pace.util.MapDocumentUtil;
 import eu.dnetlib.pace.utils.Utility;
 import eu.dnetlib.reporter.SparkReporter;
+import eu.dnetlib.support.Block;
 import eu.dnetlib.support.ConnectedComponent;
+import eu.dnetlib.support.Relation;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.Optional;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.graphx.Edge;
 import org.apache.spark.rdd.RDD;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
 import org.apache.spark.util.LongAccumulator;
 import scala.Serializable;
 import scala.Tuple2;
 
-import java.util.Map;
+import java.util.*;
 import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
 
 public class Deduper implements Serializable {
 
     private static final Log log = LogFactory.getLog(Deduper.class);
 
-    /**
-     * @param: the spark context
-     * @param: list of JSON entities to be deduped
-     * @param: the dedup configuration
-     *
-     * @return the list of connected components generated by the deduplication
-     */
-    public static JavaRDD<ConnectedComponent> dedup(JavaSparkContext context, JavaRDD<String> entities, DedupConfig config){
-
-        Map<String, LongAccumulator> accumulators = Utility.constructAccumulator(config, context.sc());
-
-        //create vertexes of the graph: <ID, MapDocument>
-        JavaPairRDD<String, MapDocument> mapDocs = mapToVertexes(context, entities, config);
-        RDD<Tuple2<Object, MapDocument>> vertexes = mapDocs.mapToPair(t -> new Tuple2<Object, MapDocument>(Utility.getHashcode(t._1()), t._2())).rdd();
-
-        //create blocks for deduplication
-        JavaPairRDD<String, Iterable<MapDocument>> blocks = createBlocks(context, mapDocs, config);
-
-        //create relations by comparing only elements in the same group
-        final JavaPairRDD<String, String> relationRDD = computeRelations(context, blocks, config);
-
-        System.out.println("Number of relations = " + relationRDD.distinct().count());
-
-        final RDD<Edge<String>> edgeRdd = relationRDD.map(it -> new Edge<>(Utility.getHashcode(it._1()), Utility.getHashcode(it._2()), "isSimilarTo")).rdd();
-
-        accumulators.forEach((name, acc) -> log.info(name + " -> " + acc.value()));
-
-        return GraphProcessor.findCCs(vertexes, edgeRdd, config.getWf().getMaxIterations()).toJavaRDD();
-    }
+    public static JavaPairRDD<String, Block> createSortedBlocks(
+            JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
+        final String of = config.getWf().getOrderField();
+        final int maxQueueSize = config.getWf().getGroupMaxSize();
+
+        return mapDocs
+            // the reduce is just to be sure that we haven't document with same id
+            .reduceByKey((a, b) -> a)
+            .map(Tuple2::_2)
+            // Clustering: from <id, doc> to List<groupkey,doc>
+            .flatMap(
+                a -> Utility
+                    .getGroupingKeys(config, a)
+                    .stream()
+                    .map(it -> Block.from(it, a))
+                    .collect(Collectors.toList())
+                    .iterator())
+            .mapToPair(block -> new Tuple2<>(block.getKey(), block))
+            .reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize));
+    }
+
+    public static Iterator<Tuple2<String, String>> ccToMergeRel(ConnectedComponent cc, DedupConfig dedupConf) {
+        return cc
+            .getDocs()
+            .stream()
+            .flatMap(
+                id -> {
+                    List<Tuple2<String, String>> tmp = new ArrayList<>();
+                    tmp.add(new Tuple2<>(cc.getCcId(), id));
+                    return tmp.stream();
+                })
+            .iterator();
+    }
+
+    public static long hash(final String id) {
+        return Hashing.murmur3_128().hashString(id).asLong();
+    }
+
+    public static ConnectedComponent entityMerger(String key, Iterator<String> values) {
+
+        ConnectedComponent cc = new ConnectedComponent();
+        cc.setCcId(key);
+        cc.setDocs(StreamSupport.stream(Spliterators.spliteratorUnknownSize(values, Spliterator.ORDERED), false)
+            .collect(Collectors.toSet()));
+        return cc;
+    }
+
+    public static JavaRDD<Relation> computeRelations(
+            JavaSparkContext context, JavaPairRDD<String, Block> blocks, DedupConfig config) {
+        Map<String, LongAccumulator> accumulators = Utility.constructAccumulator(config, context.sc());
+
+        return blocks
+            .flatMapToPair(
+                it -> {
+                    final SparkReporter reporter = new SparkReporter(accumulators);
+                    new BlockProcessor(config)
+                        .processSortedBlock(it._1(), it._2().getDocuments(), reporter);
+                    return reporter.getRelations().iterator();
+                })
+            .mapToPair(it -> new Tuple2<>(it._1() + it._2(), new Relation(it._1(), it._2(), "simRel")))
+            .reduceByKey((a, b) -> a)
+            .map(Tuple2::_2);
+    }
 
-    /**
-     * @param: the spark context
-     * @param: list of blocks
-     * @param: the dedup configuration
-     *
-     * @return the list of relations generated by the deduplication
-     */
-    public static JavaPairRDD<String, String> computeRelations(JavaSparkContext context, JavaPairRDD<String, Iterable<MapDocument>> blocks, DedupConfig config) {
-
-        Map<String, LongAccumulator> accumulators = Utility.constructAccumulator(config, context.sc());
-        return blocks.flatMapToPair(it -> {
-            final SparkReporter reporter = new SparkReporter(accumulators);
-            new BlockProcessor(config).process(it._1(), it._2(), reporter);
-            return reporter.getRelations().iterator();
-        });
-    }
+    public static void createSimRels(DedupConfig dedupConf, SparkSession spark, String entitiesPath, String simRelsPath){
+
+        JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+        JavaPairRDD<String, MapDocument> mapDocuments = sc
+            .textFile(entitiesPath)
+            .mapToPair(
+                (PairFunction<String, String, MapDocument>) s -> {
+                    MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s);
+                    return new Tuple2<>(d.getIdentifier(), d);
+                });
+
+        // create blocks for deduplication
+        JavaPairRDD<String, Block> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf);
+
+        // create relations by comparing only elements in the same group
+        JavaRDD<Relation> relations = Deduper.computeRelations(sc, blocks, dedupConf);
+
+        // save the simrel in the workingdir
+        spark
+            .createDataset(relations.rdd(), Encoders.bean(Relation.class))
+            .write()
+            .mode(SaveMode.Overwrite)
+            .save(simRelsPath);
+    }
 
-    /**
-     * @param: the spark context
-     * @param: list of entities: <id, entity>
-     * @param: the dedup configuration
-     *
-     * @return the list of blocks based on clustering of dedup configuration
-     */
-    public static JavaPairRDD<String, Iterable<MapDocument>> createBlocks(JavaSparkContext context, JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
-
-        return mapDocs.reduceByKey((a, b) -> a) //the reduce is just to be sure that we haven't document with same id
-            //Clustering: from <id, doc> to List<groupkey,doc>
-            .flatMapToPair(a -> {
-                final MapDocument currentDocument = a._2();
-
-                return Utility.getGroupingKeys(config, currentDocument).stream()
-                    .map(it -> new Tuple2<>(it, currentDocument)).collect(Collectors.toList()).iterator();
-            }).groupByKey();
-    }
+    public static void createMergeRels(DedupConfig dedupConf, String entitiesPath, String mergeRelsPath, String simRelsPath, SparkSession spark){
+
+        final int maxIterations = dedupConf.getWf().getMaxIterations();
+
+        JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+        final JavaPairRDD<Object, String> vertexes = sc
+            .textFile(entitiesPath)
+            .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
+            .mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));
+
+        final RDD<Edge<String>> edgeRdd = spark
+            .read()
+            .load(simRelsPath)
+            .as(Encoders.bean(Relation.class))
+            .javaRDD()
+            .map(Relation::toEdgeRdd)
+            .rdd();
+
+        final Dataset<Relation> mergeRels = spark
+            .createDataset(
+                GraphProcessor
+                    .findCCs(vertexes.rdd(), edgeRdd, maxIterations)
+                    .toJavaRDD()
+                    .filter(k -> k.getDocs().size() > 1)
+                    .flatMap(cc -> ccToMergeRel(cc, dedupConf))
+                    .map(it -> new Relation(it._1(), it._2(), "mergeRel"))
+                    .rdd(),
+                Encoders.bean(Relation.class));
+
+        mergeRels.write().mode(SaveMode.Overwrite).parquet(mergeRelsPath);
+    }
 
-    /**
-     * @param: the spark context
-     * @param: list of JSON entities
-     * @param: the dedup configuration
-     *
-     * @return the list of vertexes: <id, mapDocument>
-     */
-    public static JavaPairRDD<String, MapDocument> mapToVertexes(JavaSparkContext context, JavaRDD<String> entities, DedupConfig config){
-        return entities.mapToPair(it -> {
-            MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(config, it);
-            return new Tuple2<>(mapDocument.getIdentifier(), mapDocument);
-        });
-    }
+    public static void createDedupEntity(DedupConfig dedupConf, String mergeRelsPath, String entitiesPath, SparkSession spark, String dedupEntityPath){
+
+        JavaPairRDD<String, String> entities = spark
+            .read()
+            .textFile(entitiesPath)
+            .map((MapFunction<String, Tuple2<String, String>>) it ->
+                new Tuple2<>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it), it),
+                Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
+            .toJavaRDD()
+            .mapToPair(t -> t);
+
+        // <source, target>: source is the dedup_id, target is the id of the mergedIn
+        JavaPairRDD<String, Relation> mergeRels = spark
+            .read()
+            .load(mergeRelsPath)
+            .as(Encoders.bean(Relation.class))
+            .toJavaRDD()
+            .mapToPair(r -> new Tuple2<>(r.getTarget(), r));
+
+        JavaRDD<ConnectedComponent> dedupEntities = mergeRels.join(entities)
+            .mapToPair(t -> new Tuple2<>(t._2()._1().getSource(), t._2()._2()))
+            .groupByKey()
+            .map(t -> entityMerger(t._1(), t._2().iterator()));
+
+        dedupEntities.saveAsTextFile(dedupEntityPath);
+    }
 }
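Read together, these methods replace the old single-shot dedup() with three persisted stages: createSimRels writes similarity relations, createMergeRels runs connected components over them and writes merge relations, and createDedupEntity groups the original records under their dedup id. A minimal driver sketch, assuming a local SparkSession and placeholder paths; none of this wiring is part of the commit, and the DedupConfig.load name is assumed from dnet-pace-core:

    import java.nio.file.Files;
    import java.nio.file.Paths;

    import org.apache.spark.sql.SparkSession;

    import eu.dnetlib.Deduper;
    import eu.dnetlib.pace.config.DedupConfig;

    public class DedupDriver {
        public static void main(String[] args) throws Exception {
            // placeholder session and paths, for illustration only
            SparkSession spark = SparkSession.builder()
                .appName("dedup").master("local[*]").getOrCreate();
            String configJson = new String(Files.readAllBytes(Paths.get(args[0])));
            DedupConfig conf = DedupConfig.load(configJson); // loader name assumed

            // stage 1: pairwise similarity relations
            Deduper.createSimRels(conf, spark, "/tmp/entities", "/tmp/work/simrels");
            // stage 2: connected components over the simrels
            Deduper.createMergeRels(conf, "/tmp/entities", "/tmp/work/mergerels", "/tmp/work/simrels", spark);
            // stage 3: merged entities grouped by dedup id
            Deduper.createDedupEntity(conf, "/tmp/work/mergerels", "/tmp/entities", spark, "/tmp/work/dedupentities");
        }
    }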
GraphProcessor.scala
@@ -1,6 +1,5 @@
 package eu.dnetlib.graph
 
-import eu.dnetlib.pace.model.MapDocument
 import eu.dnetlib.support.ConnectedComponent
 import org.apache.spark.graphx._
 import org.apache.spark.rdd.RDD
@@ -9,30 +8,29 @@ import scala.collection.JavaConversions;
 
 object GraphProcessor {
 
-  def findCCs(vertexes: RDD[(VertexId, MapDocument)], edges: RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = {
-    val graph: Graph[MapDocument, String] = Graph(vertexes, edges).partitionBy(PartitionStrategy.RandomVertexCut) //TODO remember to remove partitionby
+  def findCCs(vertexes: RDD[(VertexId, String)], edges: RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = {
+    val graph: Graph[String, String] = Graph(vertexes, edges).partitionBy(PartitionStrategy.RandomVertexCut) //TODO remember to remove partitionby
     val cc = graph.connectedComponents(maxIterations).vertices
 
     val joinResult = vertexes.leftOuterJoin(cc).map {
-      case (id, (openaireId, cc)) => {
+      case (id, (rawId, cc)) => {
         if (cc.isEmpty) {
-          (id, openaireId)
+          (id, rawId)
         }
         else {
-          (cc.get, openaireId)
+          (cc.get, rawId)
         }
       }
     }
-    val connectedComponents = joinResult.groupByKey().map[ConnectedComponent](cc => asConnectedComponent(cc))
-    (connectedComponents)
+    val connectedComponents = joinResult.groupByKey()
+      .map[ConnectedComponent](cc => asConnectedComponent(cc))
+    connectedComponents
 
   }
 
-  def asConnectedComponent(group: (VertexId, Iterable[MapDocument])): ConnectedComponent = {
-    val docs = group._2.toSet[MapDocument]
-    val connectedComponent = new ConnectedComponent(JavaConversions.setAsJavaSet[MapDocument](docs));
+  def asConnectedComponent(group: (VertexId, Iterable[String])): ConnectedComponent = {
+    val docs = group._2.toSet[String]
+    val connectedComponent = new ConnectedComponent(JavaConversions.setAsJavaSet[String](docs));
     connectedComponent
   }
 
BlockUtils.java
@@ -135,7 +135,7 @@ public class BlockUtils implements Serializable {
 
         System.out.println("optimalBlockSize = " + optimalBlockSize);
 
-        return blocks.filter(b -> b.getElements().size() < optimalBlockSize);
+        return blocks.filter(b -> b.elements() < optimalBlockSize);
     }
 
     //cut blocks basing on number of comparisons
@@ -152,7 +152,7 @@ public class BlockUtils implements Serializable {
         double RATIO = 0.85;
 
         return blocks
-                .flatMapToPair(b -> b.getElements().stream().map(e -> new Tuple2<>(e, new Tuple2<>(b.getKey(), b.comparisons()))).iterator())
+                .flatMapToPair(b -> b.getDocuments().stream().map(e -> new Tuple2<>(e, new Tuple2<>(b.getKey(), b.comparisons()))).iterator())
                 .groupByKey()
                 .mapToPair(es -> {
                     List<Tuple2<String, Integer>> b = Lists.newArrayList(es._2());
Utility.java
@@ -5,6 +5,7 @@ import com.google.common.hash.Hashing;
 import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.model.MapDocument;
+import org.apache.commons.codec.binary.Hex;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
@@ -18,6 +19,7 @@ import org.apache.spark.util.LongAccumulator;
 import java.io.IOException;
 import java.io.StringWriter;
 import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Set;
@@ -83,4 +85,16 @@ public class Utility {
     public static long getHashcode(final String id) {
         return Hashing.murmur3_128().hashUnencodedChars(id).asLong();
     }
+
+    public static String md5(final String s) {
+        try {
+            final MessageDigest md = MessageDigest.getInstance("MD5");
+            md.update(s.getBytes(StandardCharsets.UTF_8));
+            return new String(Hex.encodeHex(md.digest()));
+        } catch (final Exception e) {
+            System.err.println("Error creating id");
+            return null;
+        }
+    }
+
 }
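The new md5 helper backs the dedup identifiers minted in ConnectedComponent.createID further down. A quick sketch of its contract; the example digest is the standard MD5 test vector, and minId is a hypothetical input:

    // lowercase hex MD5 digest of the UTF-8 bytes; null on failure
    String digest = Utility.md5("abc");
    // digest.equals("900150983cd24fb0d6963f7d28e17f72")

    // how createID composes a dedup id (minId: smallest raw id of a component)
    String ccId = "dedup::" + Utility.md5(minId);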
Block.java
@@ -1,29 +1,68 @@
 package eu.dnetlib.support;
 
-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.pace.model.MapDocument;
-import eu.dnetlib.pace.util.PaceException;
-
-import java.io.IOException;
 import java.io.Serializable;
-import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.Iterator;
 import java.util.List;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;
 import java.util.stream.StreamSupport;
 
+import com.google.common.collect.Lists;
+
+import eu.dnetlib.pace.model.MapDocument;
+
 public class Block implements Serializable {
 
-    String key;
-    List<MapDocument> elements;
+    private String key;
 
-    public Block(String key, Iterable<MapDocument> elements){
-        this.key = key;
-        this.elements = StreamSupport.stream(elements.spliterator(), false).collect(Collectors.toList());
+    private List<MapDocument> documents;
+
+    public Block() {
+        super();
     }
 
-    public Block(String key, List<MapDocument> elements){
+    public Block(String key, Iterable<MapDocument> documents) {
         this.key = key;
-        this.elements = elements;
+        this.documents = Lists.newArrayList(documents);
+    }
+
+    public static Block from(String key, MapDocument doc) {
+        Block block = new Block();
+        block.setKey(key);
+        block.setDocuments(Lists.newArrayList(doc));
+        return block;
+    }
+
+    public static Block from(String key, Iterator<Block> blocks, String orderField, int maxSize) {
+        Block block = new Block();
+        block.setKey(key);
+
+        Iterable<Block> it = () -> blocks;
+
+        block
+            .setDocuments(
+                StreamSupport
+                    .stream(it.spliterator(), false)
+                    .flatMap(b -> b.getDocuments().stream())
+                    .sorted(Comparator.comparing(a -> a.getFieldMap().get(orderField).stringValue()))
+                    .limit(maxSize)
+                    .collect(Collectors.toCollection(ArrayList::new)));
+        return block;
+    }
+
+    public static Block from(Block b1, Block b2, String orderField, int maxSize) {
+        Block block = new Block();
+        block.setKey(b1.getKey());
+        block
+            .setDocuments(
+                Stream
+                    .concat(b1.getDocuments().stream(), b2.getDocuments().stream())
+                    .sorted(Comparator.comparing(a -> a.getFieldMap().get(orderField).stringValue()))
+                    .limit(maxSize)
+                    .collect(Collectors.toCollection(ArrayList::new)));
+        return block;
     }
 
     public String getKey() {
@@ -34,30 +73,22 @@ public class Block implements Serializable {
         this.key = key;
     }
 
-    public List<MapDocument> getElements() {
-        return elements;
+    public List<MapDocument> getDocuments() {
+        return documents;
     }
 
-    public void setElements(List<MapDocument> elements) {
-        this.elements = elements;
+    public void setDocuments(List<MapDocument> documents) {
+        this.documents = documents;
     }
 
-    public int comparisons(){
-        int size = elements.size();
-        return (size*(size-1)/2);
+    public int comparisons() {
+        return (documents.size()*(documents.size()-1))/2;
     }
 
-    public int elements(){
-        return elements.size();
+    public int elements() {
+        return documents.size();
     }
 
-    @Override
-    public String toString(){
-        ObjectMapper mapper = new ObjectMapper();
-        try {
-            return mapper.writeValueAsString(this);
-        } catch (IOException e) {
-            throw new PaceException("Failed to create Json: ", e);
-        }
-    }
 }
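Since merged blocks are re-sorted on orderField and truncated to maxSize (the workflow's groupMaxSize), comparisons() bounds the quadratic pairwise work each block schedules. The arithmetic, as a standalone sketch:

    // n documents in a block -> n*(n-1)/2 pairwise comparisons
    static int comparisons(int n) {
        return n * (n - 1) / 2;
    }
    // comparisons(10) == 45, comparisons(100) == 4950: without the
    // groupMaxSize cap a single hot clustering key would dominate the stage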
ConnectedComponent.java
@@ -1,89 +1,61 @@
 package eu.dnetlib.support;
 
-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.MapDocument;
-import eu.dnetlib.pace.util.PaceException;
-import org.codehaus.jackson.annotate.JsonIgnore;
-
 import java.io.IOException;
 import java.io.Serializable;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
 import java.util.Set;
-import java.util.stream.Collectors;
+
+import eu.dnetlib.pace.utils.Utility;
+import org.apache.commons.lang.StringUtils;
+import org.codehaus.jackson.annotate.JsonIgnore;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.pace.util.PaceException;
 
 public class ConnectedComponent implements Serializable {
 
-    private Set<MapDocument> docs;
-    private String id;
-    private Map<String, Field> fieldMap;
+    private Set<String> docs;
+    private String ccId;
 
     public ConnectedComponent() {
     }
 
-    public ConnectedComponent(Set<MapDocument> docs) {
+    public ConnectedComponent(Set<String> docs) {
         this.docs = docs;
-        this.id = createID(docs);
-        this.fieldMap = chooseFields(docs);
+        createID();
     }
 
-    public Set<MapDocument> getDocs() {
-        return docs;
-    }
-
-    public void setDocs(Set<MapDocument> docs) {
-        this.docs = docs;
-    }
-
-    public String getId() {
-        return id;
-    }
-
-    public void setId(String id) {
-        this.id = id;
-    }
-
-    public Map<String, Field> chooseFields(Set<MapDocument> docs) {
-
-        int maxLength = 0;
-        Map<String, Field> maxFieldMap = new HashMap<>();
-        for (MapDocument doc : docs) {
-            if (doc.toString().length()>maxLength){
-                maxFieldMap = doc.getFieldMap();
-                maxLength = doc.toString().length();
-            }
-        }
-
-        return maxFieldMap;
-    }
-
-    public String createID(Set<MapDocument> docs) {
+    public String createID() {
         if (docs.size() > 1) {
-            String ccID = getMin(docs.stream().map(doc -> doc.getIdentifier()).collect(Collectors.toList()));
-            String prefix = ccID.split("\\|")[0];
-            String id = ccID.split("::")[1];
-            return prefix + "|dedup_______::" + id;
+            final String s = getMin();
+            ccId = "dedup::" + Utility.md5(s);
+            return ccId;
         } else {
-            return docs.iterator().next().getIdentifier();
+            return docs.iterator().next();
         }
     }
 
     @JsonIgnore
-    public String getMin(List<String> ids){
-
-        String min = ids.get(0);
-        for(String id: ids)
-            if (min.compareTo(id) > 0) {
-                min = id;
-            }
-
-        return min;
+    public String getMin() {
+
+        final StringBuilder min = new StringBuilder();
+        docs
+            .forEach(
+                i -> {
+                    if (StringUtils.isBlank(min.toString())) {
+                        min.append(i);
+                    } else {
+                        if (min.toString().compareTo(i) > 0) {
+                            min.setLength(0);
+                            min.append(i);
+                        }
+                    }
+                });
+        return min.toString();
     }
 
     @Override
-    public String toString(){
+    public String toString() {
         ObjectMapper mapper = new ObjectMapper();
         try {
             return mapper.writeValueAsString(this);
@@ -92,11 +64,19 @@ public class ConnectedComponent implements Serializable {
         }
     }
 
-    public Map<String, Field> getFieldMap() {
-        return fieldMap;
+    public Set<String> getDocs() {
+        return docs;
     }
 
-    public void setFieldMap(Map<String, Field> fieldMap) {
-        this.fieldMap = fieldMap;
+    public void setDocs(Set<String> docs) {
+        this.docs = docs;
+    }
+
+    public String getCcId() {
+        return ccId;
+    }
+
+    public void setCcId(String ccId) {
+        this.ccId = ccId;
     }
 }
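With docs now holding raw id strings, a component that actually merges records gets an opaque dedup::<md5-of-smallest-id> identifier, while a singleton keeps its raw id. Roughly, with invented ids:

    // invented ids, for illustration
    static void sketch() {
        ConnectedComponent cc = new ConnectedComponent(new HashSet<>(Arrays.asList("id_b", "id_a")));
        // getMin() -> "id_a"; cc.getCcId() -> "dedup::" + Utility.md5("id_a")

        ConnectedComponent single = new ConnectedComponent(new HashSet<>(Arrays.asList("id_x")));
        // createID() returns "id_x" for a singleton; as committed, the ccId
        // field is not assigned in that branch, only in the size > 1 case
    }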
DnetAccumulator.java
@@ -4,7 +4,7 @@ import org.apache.spark.util.AccumulatorV2;
 
 public class DnetAccumulator extends AccumulatorV2<Long, Long> {
 
-    private Long counter= 0L;
+    private Long counter = 0L;
 
     private String group;
 
Relation.java (new file)
@@ -0,0 +1,50 @@
+package eu.dnetlib.support;
+
+import eu.dnetlib.Deduper;
+import org.apache.spark.graphx.Edge;
+
+import java.io.Serializable;
+
+public class Relation implements Serializable {
+
+    String source;
+    String target;
+    String type;
+
+    public Relation() {
+    }
+
+    public Relation(String source, String target, String type) {
+        this.source = source;
+        this.target = target;
+        this.type = type;
+    }
+
+    public String getSource() {
+        return source;
+    }
+
+    public void setSource(String source) {
+        this.source = source;
+    }
+
+    public String getTarget() {
+        return target;
+    }
+
+    public void setTarget(String target) {
+        this.target = target;
+    }
+
+    public String getType() {
+        return type;
+    }
+
+    public void setType(String type) {
+        this.type = type;
+    }
+
+    public Edge<String> toEdgeRdd(){
+        return new Edge<>(Deduper.hash(source), Deduper.hash(target), type);
+    }
+}
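Relation is the bean persisted for both the simRels and mergeRels datasets; toEdgeRdd() hashes source and target with Deduper.hash, the same function createMergeRels uses to build its vertex ids, so the GraphX endpoints line up. For illustration, with made-up ids:

    // made-up ids, for illustration
    Relation r = new Relation("idA", "idB", "simRel");
    Edge<String> e = r.toEdgeRdd();
    // e.srcId() == Deduper.hash("idA")
    // e.dstId() == Deduper.hash("idB")
    // e.attr()  == "simRel"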
File diff suppressed because one or more lines are too long
DedupTestUtils.java
@@ -9,29 +9,29 @@ import java.util.List;
 public abstract class DedupTestUtils {
 
 
-    public static void printStatistics(JavaRDD<ConnectedComponent> ccs){
-        final JavaRDD<ConnectedComponent> connectedComponents = ccs.filter(cc -> cc.getDocs().size()>1);
-        final JavaRDD<ConnectedComponent> nonDeduplicated = ccs.filter(cc -> cc.getDocs().size()==1);
-
-        //print deduped
-        connectedComponents.map(cc -> {
-            StringBuilder sb = new StringBuilder();
-            for (MapDocument m : cc.getDocs()){
-                sb.append(m.getFieldMap().get("originalId").stringValue() + " - "+ m.getFieldMap().get("legalname").stringValue() + "\n");
-            }
-            return sb.toString();
-        }).foreach(s -> System.out.println("*******\n" + s + "*******\n"));
-
-        //print nondeduped
-        nonDeduplicated.foreach(cc -> {
-            System.out.println(cc.getId() + " - " + cc.getFieldMap().get("legalname").stringValue());
-        });
-
-        System.out.println("Non duplicates: " + nonDeduplicated.count());
-        System.out.println("Duplicates: " + connectedComponents.flatMap(cc -> cc.getDocs().iterator()).count());
-        System.out.println("Connected Components: " + connectedComponents.count());
-
-    }
+//    public static void printStatistics(JavaRDD<ConnectedComponent> ccs){
+//        final JavaRDD<ConnectedComponent> connectedComponents = ccs.filter(cc -> cc.getDocs().size()>1);
+//        final JavaRDD<ConnectedComponent> nonDeduplicated = ccs.filter(cc -> cc.getDocs().size()==1);
+//
+//        //print deduped
+//        connectedComponents.map(cc -> {
+//            StringBuilder sb = new StringBuilder();
+//            for (MapDocument m : cc.getDocs()){
+//                sb.append(m.getFieldMap().get("originalId").stringValue() + " - "+ m.getFieldMap().get("legalname").stringValue() + "\n");
+//            }
+//            return sb.toString();
+//        }).foreach(s -> System.out.println("*******\n" + s + "*******\n"));
+//
+//        //print nondeduped
+//        nonDeduplicated.foreach(cc -> {
+//            System.out.println(cc.getId() + " - " + cc.getFieldMap().get("legalname").stringValue());
+//        });
+//
+//        System.out.println("Non duplicates: " + nonDeduplicated.count());
+//        System.out.println("Duplicates: " + connectedComponents.flatMap(cc -> cc.getDocs().iterator()).count());
+//        System.out.println("Connected Components: " + connectedComponents.count());
+//
+//    }
 
     public static String getOrganizationLegalname(MapDocument mapDocument){
         return mapDocument.getFieldMap().get("legalname").stringValue();
Binary file not shown.
release.properties (new file)
@@ -0,0 +1,11 @@
+#release configuration
+#Fri Apr 24 14:37:08 CEST 2020
+scm.tagNameFormat=@{project.artifactId}-@{project.version}
+pushChanges=true
+scm.url=scm\:git\:https\://code-repo.d4science.org/D-Net/dnet-dedup.git/dnet-pace-core
+preparationGoals=clean verify
+projectVersionPolicyId=default
+remoteTagging=true
+scm.commentPrefix=[maven-release-plugin]
+exec.snapshotReleasePluginAllowed=false
+completedPhase=scm-check-modifications