Compare commits


No commits in common. "master" and "dnet-dedup-4.1.12" have entirely different histories.

67 changed files with 2484 additions and 8101 deletions

.gitignore vendored
View File

@@ -19,5 +19,3 @@
/build
spark-warehouse
/dhp-workflows/dhp-graph-mapper/job-override.properties
test.properties

View File

@@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dhp-build</artifactId>
<version>4.1.13-SNAPSHOT</version>
<version>4.1.12</version>
</parent>
<artifactId>dhp-build-assembly-resources</artifactId>

View File

@@ -6,11 +6,10 @@
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dhp-build</artifactId>
<version>4.1.13-SNAPSHOT</version>
<version>4.1.12</version>
</parent>
<artifactId>dhp-build-properties-maven-plugin</artifactId>
<version>4.1.13-SNAPSHOT</version>
<packaging>maven-plugin</packaging>
<description>This module is a maven plugin implementing custom properties substitutions in the build lifecycle</description>
@@ -20,19 +19,16 @@
<groupId>org.apache.maven</groupId>
<artifactId>maven-plugin-api</artifactId>
<version>3.6.3</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.maven</groupId>
<artifactId>maven-project</artifactId>
<version>2.2.1</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.maven</groupId>
<artifactId>maven-artifact</artifactId>
<version>2.2.1</version>
<scope>provided</scope>
</dependency>
<dependency>
@@ -104,29 +100,6 @@
</configuration>
</plugin>
</plugins>
<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-plugin-plugin</artifactId>
<version>3.2</version>
<configuration>
<skipErrorNoDescriptorsFound>true</skipErrorNoDescriptorsFound>
</configuration>
<executions>
<execution>
<id>mojo-descriptor</id>
<phase>process-classes</phase>
<goals>
<goal>descriptor</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>

View File

@@ -8,8 +8,6 @@ import static org.junit.jupiter.api.Assertions.assertNull;
import org.junit.jupiter.api.*;
import java.nio.file.Paths;
/** @author mhorst, claudio.atzori */
public class GenerateOoziePropertiesMojoTest {
@@ -68,7 +66,7 @@ public class GenerateOoziePropertiesMojoTest {
clearSystemProperties();
// given
String workflowSourceDir = Paths.get("eu/dnetlib/dhp/").toString();
String workflowSourceDir = "eu/dnetlib/dhp/";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
// execute
@@ -83,14 +81,14 @@ public class GenerateOoziePropertiesMojoTest {
clearSystemProperties();
// given
String workflowSourceDir = Paths.get("eu/dnetlib/dhp/wf/transformers").toString();
String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
// execute
mojo.execute();
// assert
assertEquals(Paths.get("wf/transformers").toString(), System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
}
@Test
@@ -98,13 +96,13 @@ public class GenerateOoziePropertiesMojoTest {
clearSystemProperties();
// given
String workflowSourceDir = Paths.get("wf/transformers").toString();
String workflowSourceDir = "wf/transformers";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
// execute
mojo.execute();
// assert
assertEquals(Paths.get("wf/transformers").toString(), System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
}
}
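
For reference, a minimal sketch of the sandbox-name derivation these tests pin down, assuming the mojo simply strips the known parent prefix (the helper name is hypothetical; the mojo's internals are not part of this diff):

// Hypothetical helper mirroring what the assertions above expect:
// drop the leading "eu/dnetlib/dhp/" from the workflow source dir, if present.
static String sandboxName(String workflowSourceDir) {
    String prefix = "eu/dnetlib/dhp/";
    return workflowSourceDir.startsWith(prefix)
            ? workflowSourceDir.substring(prefix.length())
            : workflowSourceDir;
}
// sandboxName("eu/dnetlib/dhp/wf/transformers") -> "wf/transformers"
// sandboxName("wf/transformers")                -> "wf/transformers"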

View File

@@ -0,0 +1,2 @@
# Tue Mar 15 14:58:05 CET 2022
projectPropertyKey=projectPropertyValue

View File

@@ -5,7 +5,7 @@
<groupId>eu.dnetlib</groupId>
<artifactId>dhp-code-style</artifactId>
<version>4.1.13-SNAPSHOT</version>
<version>4.1.12</version>
<packaging>jar</packaging>

View File

@@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-dedup</artifactId>
<version>4.1.13-SNAPSHOT</version>
<version>4.1.12</version>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>dhp-build</artifactId>

View File

@@ -1,6 +1,6 @@
useTree = true
entitiesPath = /user/michele.debonis/lda_experiments/authors_pubmed
workingPath = /user/michele.debonis/authors_dedup/gt2_dedup
numPartitions = 1000
dedupConfPath = /user/michele.debonis/lda_experiments/authors.fdup.gt2.conf.json
groundTruthFieldJPath = $.orcid
entitiesPath = /tmp/publications_test_dump
#entitiesPath = /tmp/prod_provision/graph/02_graph_cleaned/publication
workingPath = /user/michele.debonis/new_dedup_test/workingdirtree
dedupConfPath = /user/michele.debonis/new_dedup_test/pubs.tree.conf.json
numPartitions = 8000
useTree = true

View File

@@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-dedup</artifactId>
<version>4.1.13-SNAPSHOT</version>
<version>4.1.12</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -1,7 +1,7 @@
package eu.dnetlib;
import com.google.common.hash.Hashing;
import eu.dnetlib.graph.JavaGraphProcessor;
import eu.dnetlib.graph.GraphProcessor;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.BlockProcessorForTesting;
@@ -19,6 +19,7 @@ import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.graphx.Edge;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
@@ -57,13 +58,14 @@ public class Deduper implements Serializable {
.reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize));
}
public static Iterator<Tuple2<String, String>> ccToMergeRel(Tuple2<String, List<String>> cc, DedupConfig dedupConf) {
return cc._2()
public static Iterator<Tuple2<String, String>> ccToMergeRel(ConnectedComponent cc, DedupConfig dedupConf) {
return cc
.getDocs()
.stream()
.flatMap(
id -> {
List<Tuple2<String, String>> tmp = new ArrayList<>();
tmp.add(new Tuple2<>(cc._1(), id));
tmp.add(new Tuple2<>(cc.getCcId(), id));
return tmp.stream();
})
.iterator();
@@ -136,19 +138,21 @@ public class Deduper implements Serializable {
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
.mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));
final JavaRDD<Edge<String>> edgeRdd = spark
final RDD<Edge<String>> edgeRdd = spark
.read()
.load(simRelsPath)
.as(Encoders.bean(Relation.class))
.javaRDD()
.map(Relation::toEdgeRdd);
.map(Relation::toEdgeRdd)
.rdd();
JavaPairRDD<String, List<String>> ccs = JavaGraphProcessor
.findCCs(vertexes, edgeRdd, dedupConf.getWf().getMaxIterations());
JavaRDD<ConnectedComponent> ccs = GraphProcessor
.findCCs(vertexes.rdd(), edgeRdd, maxIterations)
.toJavaRDD();
JavaRDD<Relation> mergeRel = ccs
.filter(cc -> cc._2().size() > 1)
.flatMap(cc -> Deduper.ccToMergeRel(cc, dedupConf))
.filter(k -> k.getDocs().size() > 1)
.flatMap(cc -> ccToMergeRel(cc, dedupConf))
.map(it -> new Relation(it._1(), it._2(), "mergeRel"));
final Dataset<Relation> mergeRels = spark
@@ -159,7 +163,7 @@ public class Deduper implements Serializable {
mergeRels.write().mode(SaveMode.Overwrite).parquet(mergeRelsPath);
}
public static void createDedupEntity(DedupConfig dedupConf, String simRelsPath, String mergeRelsPath, String entitiesPath, SparkSession spark, String dedupEntityPath){
public static void createDedupEntity(DedupConfig dedupConf, String mergeRelsPath, String entitiesPath, SparkSession spark, String dedupEntityPath){
JavaPairRDD<String, String> entities = spark
.read()
@@ -170,15 +174,7 @@
.toJavaRDD()
.mapToPair(t -> t);
// <source_raw_id, relation(source, target)>
JavaPairRDD<String, Relation> simRels = spark
.read()
.load(simRelsPath)
.as(Encoders.bean(Relation.class))
.toJavaRDD()
.mapToPair(r-> new Tuple2<>(r.getSource(), r));
// <raw_id, relation(dedup_id, raw_id)>
// <source, target>: source is the dedup_id, target is the id of the mergedIn
JavaPairRDD<String, Relation> mergeRels = spark
.read()
.load(mergeRelsPath)
@@ -191,22 +187,7 @@
.groupByKey()
.map(t-> entityMerger(t._1(), t._2().iterator()));
JavaPairRDD<String, Iterable<Relation>> simRelsWithDedupId = simRels
.join(mergeRels)
.mapToPair(x -> new Tuple2<>(x._2()._2().getSource(), x._2()._1()))
.groupByKey();
JavaRDD<ConnectedComponent> groupEntity = mergeRels.join(entities)
.mapToPair(t -> new Tuple2<>(t._2()._1().getSource(), t._2()._2()))
.groupByKey()
.join(simRelsWithDedupId)
.map(x -> new ConnectedComponent(
x._1(),
x._2()._1(),
x._2()._2())
);
groupEntity.saveAsTextFile(dedupEntityPath);
dedupEntities.saveAsTextFile(dedupEntityPath);
}
}
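
A minimal sketch of the reworked ccToMergeRel contract, assuming a DedupConfig is already loaded and Guava's Sets is imported (the ids and ccId below are invented):

// Illustrative only: exercises the new ConnectedComponent-based signature.
static void demoCcToMergeRel(DedupConfig dedupConf) {
    ConnectedComponent cc = new ConnectedComponent(Sets.newHashSet("id1", "id2"));
    cc.setCcId("dedup::x");
    // emits ("dedup::x", "id1") and ("dedup::x", "id2"):
    // one merge pair per member document of the component
    Deduper.ccToMergeRel(cc, dedupConf)
            .forEachRemaining(t -> System.out.println(t._1() + " -> " + t._2()));
}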

View File

@@ -1,56 +0,0 @@
package eu.dnetlib.graph;
import com.clearspring.analytics.util.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.utils.Utility;
import eu.dnetlib.support.ConnectedComponent;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.graphx.*;
import org.apache.spark.rdd.RDD;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;
import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;
import java.util.List;
public class JavaGraphProcessor {
//<ccId, list(json)>
public static JavaPairRDD<String, List<String>> findCCs(JavaPairRDD<Object, String> vertexes, JavaRDD<Edge<String>> edges, int maxIterations) {
ClassTag<String> stringTag = ClassTag$.MODULE$.apply(String.class);
Graph<String, String> graph =
Graph.apply(
vertexes.rdd(),
edges.rdd(),
"",
StorageLevel.MEMORY_ONLY(),
StorageLevel.MEMORY_ONLY(),
stringTag,
stringTag
);
GraphOps<String, String> graphOps = new GraphOps<>(graph, stringTag, stringTag);
JavaRDD<Tuple2<Object, Object>> cc = graphOps.connectedComponents(maxIterations).vertices().toJavaRDD();
JavaPairRDD<Object, String> joinResult = vertexes
.leftOuterJoin(cc.mapToPair(x -> x))
.mapToPair(x -> {
if (!x._2()._2().isPresent()) {
return new Tuple2<>(x._1(), x._2()._1());
} else {
return new Tuple2<>(x._2()._2(), x._2()._1());
}
});
return joinResult
.groupByKey()
.map(x -> Lists.newArrayList(x._2()))
.zipWithUniqueId()
.mapToPair(x -> new Tuple2<>("dedup______::" + x._2().toString(), x._1()));
}
}
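
How the removed JavaGraphProcessor was driven by its call sites, as a minimal sketch assuming an existing JavaSparkContext sc (the two-vertex graph is invented for illustration):

// Illustrative only: a two-node graph collapses into one connected component.
JavaPairRDD<Object, String> vertexes = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>((Object) 1L, "docA"),
        new Tuple2<>((Object) 2L, "docB")));
JavaRDD<Edge<String>> edges = sc.parallelize(
        Collections.singletonList(new Edge<>(1L, 2L, "simRel")));
// yields a single <"dedup______::<uniqueId>", ["docA", "docB"]> pair
JavaPairRDD<String, List<String>> ccs = JavaGraphProcessor.findCCs(vertexes, edges, 20);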

View File

@@ -19,7 +19,6 @@ import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.stream.Collectors;
public abstract class AbstractSparkJob implements Serializable {
@@ -60,7 +59,7 @@ public abstract class AbstractSparkJob implements Serializable {
Path path=new Path(filePath);
FileSystem fs = FileSystem.get(new Configuration());
BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(path), StandardCharsets.UTF_8));
BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(path)));
try {
return String.join("", br.lines().collect(Collectors.toList()));
} finally {

View File

@@ -1,36 +1,20 @@
package eu.dnetlib.jobs;
import eu.dnetlib.Deduper;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.MapDocumentUtil;
import eu.dnetlib.pace.utils.Utility;
import eu.dnetlib.support.ArgumentApplicationParser;
import eu.dnetlib.support.Block;
import eu.dnetlib.support.ConnectedComponent;
import eu.dnetlib.support.Relation;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.codehaus.jackson.map.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import java.io.IOException;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
public class SparkComputeStatistics extends AbstractSparkJob {
@@ -58,42 +42,18 @@ public class SparkComputeStatistics extends AbstractSparkJob {
@Override
public void run() throws IOException {
//https://towardsdatascience.com/7-evaluation-metrics-for-clustering-algorithms-bdc537ff54d2#:~:text=There%20are%20two%20types%20of,to%20all%20unsupervised%20learning%20results)
// read oozie parameters
final String entitiesPath = parser.get("entitiesPath");
final String workingPath = parser.get("workingPath");
final String dedupConfPath = parser.get("dedupConfPath");
final String groundTruthFieldJPath = parser.get("groundTruthFieldJPath");
final int numPartitions = Optional
.ofNullable(parser.get("numPartitions"))
.map(Integer::valueOf)
.orElse(NUM_PARTITIONS);
log.info("entitiesPath: '{}'", entitiesPath);
log.info("workingPath: '{}'", workingPath);
log.info("numPartitions: '{}'", numPartitions);
log.info("dedupConfPath: '{}'", dedupConfPath);
log.info("groundTruthFieldJPath: '{}'", groundTruthFieldJPath);
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
DedupConfig dedupConfig = loadDedupConfig(dedupConfPath);
JavaPairRDD<String, MapDocument> mapDocuments = sc
.textFile(entitiesPath)
.repartition(numPartitions)
.mapToPair(
(PairFunction<String, String, MapDocument>) s -> {
MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConfig, s);
//put the groundTruthField used to compute statistics into the map
d.getFieldMap().put("groundTruth", new FieldValueImpl(Type.String, "groundTruth", MapDocumentUtil.getJPathString(groundTruthFieldJPath, s)));
return new Tuple2<>(d.getIdentifier(), d);
});
JavaRDD<String> entities = mapDocuments.map(d -> d._2().getFieldMap().get("groundTruth").stringValue());
// create blocks
JavaRDD<List<String>> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConfig)
.map(b -> b._2().getDocuments().stream().map(d -> d.getFieldMap().get("groundTruth").stringValue()).collect(Collectors.toList()));
log.info("entitiesPath: '{}'", entitiesPath);
log.info("workingPath: '{}'", workingPath);
log.info("numPartitions: '{}'", numPartitions);
// <source, target>: source is the dedup_id, target is the id of the mergedIn
JavaRDD<Relation> mergerels = spark
@@ -108,38 +68,15 @@ public class SparkComputeStatistics extends AbstractSparkJob {
.as(Encoders.bean(Relation.class))
.toJavaRDD();
JavaRDD<List<String>> groups = sc.textFile(workingPath + "/groupentities")
.map(e -> new ObjectMapper().readValue(e, ConnectedComponent.class))
.map(e -> e.getDocs().stream().map(d -> MapDocumentUtil.getJPathString(groundTruthFieldJPath, d)).collect(Collectors.toList()));
long entities_number = entities.count();
long blocks_number = blocks.count();
double blocks_randIndex = randIndex(blocks);
long simrels_number = simrels.count();
long mergerels_number = mergerels.count();
double groups_randIndex = randIndex(groups);
long groups_number = groups.count();
long groundtruth_number = entities.filter(e -> !e.isEmpty()).count();
long correct_groups = groups.filter(x -> x.stream().distinct().count()==1).count();
long wrong_groups = groups_number - correct_groups;
long connected_components = mergerels.groupBy(Relation::getSource).count();
String print =
"Entities : " + entities_number + "\n" +
"Ground Truth : " + groundtruth_number + "\n" +
"Blocks : " + blocks_number + "\n" +
"Blocks RI : " + blocks_randIndex + "\n" +
"SimRels : " + simrels_number + "\n" +
"MergeRels : " + mergerels_number + "\n" +
"Groups : " + groups_number + " (correct: " + correct_groups + ", wrong: " + wrong_groups + ")\n" +
"Groups RI : " + groups_randIndex;
System.out.println(print);
writeStatsFileToHDFS(groundtruth_number, entities_number, blocks_randIndex, groups_randIndex, blocks_number, simrels_number, mergerels_number, groups_number, workingPath + "/stats_file.txt");
writeStatsFileToHDFS(simrels_number, mergerels_number, connected_components, workingPath + "/stats_file");
}
public static void writeStatsFileToHDFS(long groundtruth_number, long entities_number, double blocks_randIndex, double groups_randIndex, long blocks_number, long simrels_number, long mergerels_number, long groups_number, String filePath) throws IOException {
public static void writeStatsFileToHDFS(long simrels_number, long mergerels_number, long connected_components, String filePath) throws IOException {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
@@ -156,14 +93,9 @@ public class SparkComputeStatistics extends AbstractSparkJob {
}
String print =
"Entities : " + entities_number + "\n" +
"Ground Truth : " + groundtruth_number + "\n" +
"Blocks : " + blocks_number + "\n" +
"Blocks RI : " + blocks_randIndex + "\n" +
"SimRels : " + simrels_number + "\n" +
"MergeRels : " + mergerels_number + "\n" +
"Groups : " + groups_number + "\n" +
"Groups RI : " + groups_randIndex;
"Similarity Relations : " + simrels_number + "\n" +
"Merge Relations : " + mergerels_number + "\n" +
"Connected Components : " + connected_components;
// Create file to write
FSDataOutputStream out = fs.create(outFile);
@@ -177,31 +109,5 @@
e.printStackTrace();
}
}
//TODO find another measure that also takes into account the elements outside of the group
//RandIndex = number of pairwise correct predictions/total number of possible pairs (in the same cluster) -> bounded between 0 and 1
public double randIndex(JavaRDD<List<String>> clusters) {
Tuple2<Integer, Integer> reduce = clusters.map(c -> {
int num = 0;
for (String id : c.stream().distinct().filter(s -> !s.isEmpty()).collect(Collectors.toList())) {
int n = (int) c.stream().filter(i -> i.equals(id)).count();
num += binomialCoefficient(n);
}
int den = binomialCoefficient(c.size());
return new Tuple2<>(num, den);
})
.reduce((a, b) -> new Tuple2<>(a._1() + b._1(), a._2() + b._2()));
return (double)reduce._1()/ reduce._2();
}
private static int binomialCoefficient(int n)
{
return n*(n-1)/2;
}
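// Worked example (illustrative, not part of the source): for a cluster
// c = ["A", "A", "B"], num = binomialCoefficient(2) = 1 (the single correct
// "A"-"A" pair) while den = binomialCoefficient(3) = 3 possible pairs,
// so this cluster contributes (1, 3), i.e. a Rand index of 1/3 on its own.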
//V-measure = harmonic mean of homogeneity and completeness; homogeneity: each cluster contains only members of a single class; completeness: all members of a given class are assigned to the same cluster
}

View File

@@ -7,7 +7,6 @@ import eu.dnetlib.pace.utils.Utility;
import eu.dnetlib.support.ArgumentApplicationParser;
import eu.dnetlib.support.ConnectedComponent;
import eu.dnetlib.support.Relation;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
@@ -17,32 +16,29 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import scala.Tuple3;
import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import java.util.Optional;
public class SparkCreateGroupEntity extends AbstractSparkJob {
public class SparkCreateDedupEntity extends AbstractSparkJob {
private static final Logger log = LoggerFactory.getLogger(SparkCreateGroupEntity.class);
private static final Logger log = LoggerFactory.getLogger(eu.dnetlib.jobs.SparkCreateDedupEntity.class);
public SparkCreateGroupEntity(ArgumentApplicationParser parser, SparkSession spark) {
public SparkCreateDedupEntity(ArgumentApplicationParser parser, SparkSession spark) {
super(parser, spark);
}
public static void main(String[] args) throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
Utility.readResource("/jobs/parameters/createGroupEntity_parameters.json", SparkCreateGroupEntity.class)
Utility.readResource("/jobs/parameters/createDedupEntity_parameters.json", SparkCreateDedupEntity.class)
);
parser.parseArgument(args);
SparkConf conf = new SparkConf();
new SparkCreateGroupEntity(
new SparkCreateDedupEntity(
parser,
getSparkSession(conf)
).run();
@@ -67,7 +63,6 @@ public class SparkCreateGroupEntity extends AbstractSparkJob {
DedupConfig dedupConf = DedupConfig.load(readFileFromHDFS(dedupConfPath));
// <raw_id, json>
JavaPairRDD<String, String> entities = spark
.read()
.textFile(entitiesPath)
@@ -77,15 +72,7 @@
.toJavaRDD()
.mapToPair(t -> t);
// <source_raw_id, relation(source, target)>
JavaPairRDD<String, Relation> simRels = spark
.read()
.load(workingPath + "/simrels")
.as(Encoders.bean(Relation.class))
.toJavaRDD()
.mapToPair(r-> new Tuple2<>(r.getSource(), r));
// <raw_id, relation(dedup_id, raw_id)>
// <source, target>: source is the dedup_id, target is the id of the mergedIn
JavaPairRDD<String, Relation> mergeRels = spark
.read()
.load(workingPath + "/mergerels")
@@ -93,23 +80,12 @@
.toJavaRDD()
.mapToPair(r -> new Tuple2<>(r.getTarget(), r));
// <dedup_id, simrel>
JavaPairRDD<String, Iterable<Relation>> simRelsWithDedupId = simRels
.join(mergeRels)
.mapToPair(x -> new Tuple2<>(x._2()._2().getSource(), x._2()._1()))
.groupByKey();
JavaRDD<ConnectedComponent> groupEntity = mergeRels.join(entities)
JavaRDD<ConnectedComponent> dedupEntities = mergeRels.join(entities)
.mapToPair(t -> new Tuple2<>(t._2()._1().getSource(), t._2()._2()))
.groupByKey()
.join(simRelsWithDedupId)
.map(x -> new ConnectedComponent(
x._1(),
x._2()._1(),
x._2()._2())
);
.map(t-> Deduper.entityMerger(t._1(), t._2().iterator()));
groupEntity.saveAsTextFile(workingPath + "/groupentities", GzipCodec.class);
dedupEntities.saveAsTextFile(workingPath + "dedupentity");
}

View File

@@ -1,7 +1,7 @@
package eu.dnetlib.jobs;
import eu.dnetlib.Deduper;
import eu.dnetlib.graph.JavaGraphProcessor;
import eu.dnetlib.graph.GraphProcessor;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
import eu.dnetlib.pace.utils.Utility;
@@ -24,7 +24,6 @@ import org.slf4j.LoggerFactory;
import scala.Tuple2;
import java.io.IOException;
import java.util.List;
import java.util.Optional;
import static eu.dnetlib.Deduper.hash;
@@ -79,18 +78,20 @@ public class SparkCreateMergeRels extends AbstractSparkJob {
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
.mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));
final JavaRDD<Edge<String>> edgeRdd = spark
final RDD<Edge<String>> edgeRdd = spark
.read()
.load(workingPath + "/simrels")
.as(Encoders.bean(Relation.class))
.javaRDD()
.map(Relation::toEdgeRdd);
.map(Relation::toEdgeRdd)
.rdd();
JavaPairRDD<String, List<String>> ccs = JavaGraphProcessor
.findCCs(vertexes, edgeRdd, dedupConf.getWf().getMaxIterations());
JavaRDD<ConnectedComponent> ccs = GraphProcessor
.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations())
.toJavaRDD();
JavaRDD<Relation> mergeRel = ccs
.filter(cc -> cc._2().size() > 1)
.filter(k -> k.getDocs().size() > 1)
.flatMap(cc -> Deduper.ccToMergeRel(cc, dedupConf))
.map(it -> new Relation(it._1(), it._2(), "mergeRel"));

View File

@@ -14,7 +14,6 @@ import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

View File

@@ -1,7 +1,10 @@
package eu.dnetlib.support;
import java.io.Serializable;
import java.util.*;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
@@ -9,7 +12,6 @@ import java.util.stream.StreamSupport;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.model.MapDocument;
import org.codehaus.jackson.annotate.JsonIgnore;
public class Block implements Serializable {
@@ -21,11 +23,6 @@ public class Block implements Serializable {
super();
}
public Block(String key, List<MapDocument> documents) {
this.key = key;
this.documents = documents;
}
public Block(String key, Iterable<MapDocument> documents) {
this.key = key;
this.documents = Lists.newArrayList(documents);

View File

@@ -5,35 +5,54 @@ import java.io.Serializable;
import java.util.HashSet;
import java.util.Set;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.utils.Utility;
import org.apache.commons.lang.StringUtils;
import org.codehaus.jackson.annotate.JsonIgnore;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.util.PaceException;
import org.codehaus.jackson.map.ObjectMapper;
public class ConnectedComponent implements Serializable {
private HashSet<String> docs;
private String ccId;
private HashSet<Relation> simrels;
public ConnectedComponent() {
}
public ConnectedComponent(String ccId, Set<String> docs, Set<Relation> simrels) {
this.docs = new HashSet<>(docs);
this.ccId = ccId;
this.simrels = new HashSet<>(simrels);
}
public ConnectedComponent(Set<String> docs) {
this.docs = new HashSet<>(docs);
//initialization of id and relations missing
createID();
}
public ConnectedComponent(String ccId, Iterable<String> docs, Iterable<Relation> simrels) {
this.ccId = ccId;
this.docs = Sets.newHashSet(docs);
this.simrels = Sets.newHashSet(simrels);
public String createID() {
if (docs.size() > 1) {
final String s = getMin();
ccId = "dedup::" + Utility.md5(s);
return ccId;
} else {
return docs.iterator().next();
}
}
@JsonIgnore
public String getMin() {
final StringBuilder min = new StringBuilder();
docs
.forEach(
i -> {
if (StringUtils.isBlank(min.toString())) {
min.append(i);
} else {
if (min.toString().compareTo(i) > 0) {
min.setLength(0);
min.append(i);
}
}
});
return min.toString();
}
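// Illustrative (not in the source): with docs = {"50|b", "50|a"}, getMin()
// returns "50|a", so createID() yields "dedup::" + Utility.md5("50|a");
// a singleton component instead just returns its only document id.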
@Override
@@ -61,12 +80,4 @@ public class ConnectedComponent implements Serializable {
public void setCcId(String ccId) {
this.ccId = ccId;
}
public void setSimrels(HashSet<Relation> simrels) {
this.simrels = simrels;
}
public HashSet<Relation> getSimrels() {
return simrels;
}
}

View File

@@ -16,10 +16,6 @@
<name>dedupConfPath</name>
<description>path for the dedup configuration file</description>
</property>
<property>
<name>groundTruthFieldJPath</name>
<description>jpath of the field to be used as ground truth</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
@@ -142,33 +138,6 @@
<arg>--numPartitions</arg><arg>${numPartitions}</arg>
<arg>--dedupConfPath</arg><arg>${dedupConfPath}</arg>
</spark>
<ok to="CreateGroupEntities"/>
<error to="Kill"/>
</action>
<action name="CreateGroupEntities">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Create Group Entities</name>
<class>eu.dnetlib.jobs.SparkCreateGroupEntity</class>
<jar>dnet-dedup-test-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--entitiesPath</arg><arg>${entitiesPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--numPartitions</arg><arg>${numPartitions}</arg>
<arg>--dedupConfPath</arg><arg>${dedupConfPath}</arg>
</spark>
<ok to="ComputeStatistics"/>
<error to="Kill"/>
</action>
@@ -193,12 +162,36 @@
<arg>--entitiesPath</arg><arg>${entitiesPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--numPartitions</arg><arg>${numPartitions}</arg>
<arg>--dedupConfPath</arg><arg>${dedupConfPath}</arg>
<arg>--groundTruthFieldJPath</arg><arg>${groundTruthFieldJPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<!--<action name="CreateDedupEntities">-->
<!--<spark xmlns="uri:oozie:spark-action:0.2">-->
<!--<master>yarn</master>-->
<!--<mode>cluster</mode>-->
<!--<name>Create Dedup Entities</name>-->
<!--<class>eu.dnetlib.jobs.SparkCreateDedupEntity</class>-->
<!--<jar>dnet-dedup-test-${projectVersion}.jar</jar>-->
<!--<spark-opts>-->
<!--&#45;&#45;executor-memory=${sparkExecutorMemory}-->
<!--&#45;&#45;executor-cores=${sparkExecutorCores}-->
<!--&#45;&#45;driver-memory=${sparkDriverMemory}-->
<!--&#45;&#45;conf spark.extraListeners=${spark2ExtraListeners}-->
<!--&#45;&#45;conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}-->
<!--&#45;&#45;conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}-->
<!--&#45;&#45;conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}-->
<!--&#45;&#45;conf spark.sql.shuffle.partitions=3840-->
<!--</spark-opts>-->
<!--<arg>&#45;&#45;entitiesPath</arg><arg>${entitiesPath}</arg>-->
<!--<arg>&#45;&#45;workingPath</arg><arg>${workingPath}</arg>-->
<!--<arg>&#45;&#45;numPartitions</arg><arg>${numPartitions}</arg>-->
<!--<arg>&#45;&#45;dedupConfPath</arg><arg>${dedupConfPath}</arg>-->
<!--</spark>-->
<!--<ok to="End"/>-->
<!--<error to="Kill"/>-->
<!--</action>-->
<end name="End"/>
</workflow-app>

View File

@@ -16,17 +16,5 @@
"paramLongName": "numPartitions",
"paramDescription": "number of partitions for the similarity relations intermediate phases",
"paramRequired": false
},
{
"paramName": "dc",
"paramLongName": "dedupConfPath",
"paramDescription": "dedup configuration to be used",
"paramRequired": true
},
{
"paramName": "gt",
"paramLongName": "groundTruthFieldJPath",
"paramDescription": "field to be used as groundtruth",
"paramRequired": true
}
]

File diff suppressed because one or more lines are too long

View File

@@ -1,134 +0,0 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "author",
"subEntityType": "author",
"subEntityValue": "author",
"orderField": "fullname",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "50",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
"resultResult_publicationDataset_isRelatedTo",
"resultResult_similarity_isAmongTopNSimilarDocuments",
"resultResult_similarity_hasAmongTopNSimilarDocuments",
"resultOrganization_affiliation_isAffiliatedWith",
"resultResult_part_hasPart",
"resultResult_part_isPartOf",
"resultResult_supplement_isSupplementTo",
"resultResult_supplement_isSupplementedBy",
"resultResult_version_isVersionOf"
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering" : [
{ "name" : "lnfi", "fields" : [ "fullname" ], "params" : {} }
],
"decisionTree": {
"start": {
"fields": [
{
"field": "orcid",
"comparator": "exactMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "orcids",
"ignoreUndefined": "true"
},
"orcids": {
"fields": [
{
"field": "orcids",
"comparator": "stringListMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 3.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "coauthors",
"undefined": "coauthors",
"ignoreUndefined": "true"
},
"coauthors": {
"fields": [
{
"field": "coauthors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "topicsMatch",
"negative": "NO_MATCH",
"undefined": "topicsMatch",
"ignoreUndefined": "true"
},
"topicsMatch": {
"fields": [
{
"field": "topics",
"comparator": "cosineSimilarity",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.5,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "false"
}
},
"model": [
{
"name": "topics",
"type": "DoubleArray",
"path": "$.topics"
},
{
"name": "fullname",
"type": "String",
"path": "$.fullname"
},
{
"name": "orcid",
"type": "String",
"path": "$.orcid"
},
{
"name": "coauthors",
"type": "List",
"path": "$.coAuthors[*].fullname"
},
{
"name": "orcids",
"type": "List",
"path": "$.coAuthors[*].orcid"
}
],
"blacklists": {},
"synonyms": {}
}
}

View File

@@ -1,134 +0,0 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "author",
"subEntityType": "author",
"subEntityValue": "author",
"orderField": "fullname",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "50",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
"resultResult_publicationDataset_isRelatedTo",
"resultResult_similarity_isAmongTopNSimilarDocuments",
"resultResult_similarity_hasAmongTopNSimilarDocuments",
"resultOrganization_affiliation_isAffiliatedWith",
"resultResult_part_hasPart",
"resultResult_part_isPartOf",
"resultResult_supplement_isSupplementTo",
"resultResult_supplement_isSupplementedBy",
"resultResult_version_isVersionOf"
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering" : [
{ "name" : "lnfi", "fields" : [ "fullname" ], "params" : {} }
],
"decisionTree": {
"start": {
"fields": [
{
"field": "orcid",
"comparator": "exactMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "orcids",
"ignoreUndefined": "true"
},
"orcids": {
"fields": [
{
"field": "orcids",
"comparator": "stringListMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 3.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "coauthors",
"undefined": "coauthors",
"ignoreUndefined": "true"
},
"coauthors": {
"fields": [
{
"field": "coauthors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "topicsMatch",
"negative": "NO_MATCH",
"undefined": "topicsMatch",
"ignoreUndefined": "true"
},
"topicsMatch": {
"fields": [
{
"field": "topics",
"comparator": "cosineSimilarity",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "false"
}
},
"model": [
{
"name": "topics",
"type": "DoubleArray",
"path": "$.topics"
},
{
"name": "fullname",
"type": "String",
"path": "$.fullname"
},
{
"name": "orcid",
"type": "String",
"path": "$.orcid"
},
{
"name": "coauthors",
"type": "List",
"path": "$.coAuthors[*].fullname"
},
{
"name": "orcids",
"type": "List",
"path": "$.coAuthors[*].orcid"
}
],
"blacklists": {},
"synonyms": {}
}
}

View File

@@ -3,7 +3,7 @@
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "datasource",
"orderField" : "englishname",
"orderField" : "name",
"queueMaxSize" : "2000",
"groupMaxSize" : "50",
"slidingWindowSize" : "200",
@@ -14,9 +14,8 @@
},
"pace" : {
"clustering" : [
{ "name" : "sortedngrampairs", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 2, "ngramLen" : "3", "collapseOn:name": "0"} },
{ "name" : "suffixprefix", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 1, "len" : "3", "collapseOn:name": "0" } },
{"name" : "ngrams", "fields" : ["officialname", "englishname"], "params" : {"ngramLen": 4, "max" : 2, "maxPerToken": 2, "minNgramLen": 1, "collapseOn:name": "0"}},
{ "name" : "sortedngrampairs", "fields" : [ "name" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "name" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
],
"decisionTree" : {
@@ -40,36 +39,16 @@
"layer2": {
"fields": [
{
"field": "officialname",
"field": "name",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"threshold": 0.9
}
},
{
"field": "englishname",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"threshold": 0.9
}
},
{
"field": "officialname",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"crossCompare": "englishname",
"threshold": 0.9
}
}
],
"threshold": 0.9,
"aggregation": "MAX",
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
@@ -77,11 +56,12 @@
}
},
"model" : [
{ "name" : "englishname", "type" : "String", "path" : "$.englishname" },
{ "name" : "officialname", "type" : "String", "path" : "$.officialname" },
{ "name" : "name", "type" : "String", "path" : "$.name" },
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl" }
],
"blacklists" : {},
"blacklists" : {
"legalname" : []
},
"synonyms": {}
}
}

View File

@@ -51,6 +51,37 @@
],
"decisionTree": {
"start": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid"
}
},
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid",
"crossCompare": "alternateid"
}
}
],
"threshold": 0.5,
"aggregation": "MAX",
"positive": "layer1",
"negative": "layer2",
"undefined": "layer2",
"ignoreUndefined": "true"
},
"layer1": {
"fields": [
{
"field": "title",
@@ -63,8 +94,49 @@
"threshold": 0.9,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "MATCH",
"undefined": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"layer2": {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
},
{
"field": "authors",
"comparator": "sizeMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "AND",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
"ignoreUndefined": "false"
},
"layer3": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.99,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
}
},

View File

@@ -6,9 +6,9 @@
"subEntityType": "resulttype",
"subEntityValue": "publication",
"orderField": "title",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"queueMaxSize": "5000",
"groupMaxSize": "2000",
"maxChildren": "1000",
"slidingWindowSize": "50",
"rootBuilder": [
"result",
@@ -28,26 +28,9 @@
"idPath": "$.id"
},
"pace": {
"clustering": [
{
"name": "wordsStatsSuffixPrefixChain",
"fields": [
"title"
],
"params": {
"mod": "10"
}
},
{
"name": "lowercase",
"fields": [
"doi",
"altdoi"
],
"params": {
"collapseOn:pid": "0"
}
}
"clustering" : [
{ "name" : "wordsStatsSuffixPrefixChain", "fields" : [ "title" ], "params" : { "mod" : "10" } },
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
],
"decisionTree": {
"start": {
@@ -59,75 +42,18 @@
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid",
"mode": "count"
"jpath_classid": "$.qualifier.classid"
}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "instanceTypeCheck",
"undefined": "instanceTypeCheck",
"ignoreUndefined": "false"
},
"instanceTypeCheck": {
"fields": [
{
"field": "instance",
"comparator": "instanceTypeMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 0.5,
"aggregation": "MAX",
"positive": "pidVSaltid",
"negative": "NO_MATCH",
"undefined": "pidVSaltid",
"ignoreUndefined": "true"
},
"pidVSaltid": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid",
"crossCompare": "alternateid",
"mode": "count"
}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "softCheck",
"negative": "earlyExits",
"undefined": "earlyExits",
"ignoreUndefined": "true"
},
"softCheck": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.9,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"negative": "layer2",
"undefined": "layer2",
"ignoreUndefined": "true"
},
"earlyExits": {
"layer2": {
"fields": [
{
"field": "title",
@@ -146,12 +72,12 @@
],
"threshold": 1.0,
"aggregation": "AND",
"positive": "strongCheck",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "strongCheck",
"undefined": "layer3",
"ignoreUndefined": "false"
},
"strongCheck": {
"layer3": {
"fields": [
{
"field": "title",
@@ -163,30 +89,9 @@
],
"threshold": 0.99,
"aggregation": "AVG",
"positive": "surnames",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"surnames": {
"fields": [
{
"field": "authors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"surname_th": 0.75,
"fullname_th": 0.75,
"mode": "full"
}
}
],
"threshold": 0.6,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
}
},
@@ -194,29 +99,18 @@
{
"name": "doi",
"type": "String",
"path": "$.instance[*].pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "altdoi",
"type": "String",
"path": "$.instance[*].alternateIdentifier[?(@.qualifier.classid == 'doi')].value"
"path": "$.pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "pid",
"type": "JSON",
"path": "$.instance[*].pid[*]",
"overrideMatch": "true"
},
{
"name": "alternateid",
"type": "JSON",
"path": "$.instance[*].alternateIdentifier[*]",
"path": "$.pid",
"overrideMatch": "true"
},
{
"name": "title",
"type": "StringConcat",
"path": "$.title[?(@.qualifier.classid == 'main title')].value|||$.title[?(@.qualifier.classid == 'subtitle')].value",
"type": "String",
"path": "$.title[?(@.qualifier.classid == 'main title')].value",
"length": 250,
"size": 5
},
@@ -230,11 +124,6 @@
"name": "resulttype",
"type": "String",
"path": "$.resulttype.classid"
},
{
"name": "instance",
"type": "List",
"path": "$.instance[*].instancetype.classname"
}
],
"blacklists": {
@@ -465,16 +354,7 @@
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*",
"(?i)^.*authors[']? reply\\.?$",
"(?i)^.*authors[']? response\\.?$",
"^Data [mM]anagement [sS]ervices\\.$",
"Research and Advanced Technology for Digital Libraries",
"(?i)^risky business$",
"(?i)^great expectations\\.?$",
"(?i)^what's in a name\\?$",
"(?i)^decisions, decisions\\.?$",
"(?i)^update to our reader, reviewer, and author communities.*",
"(?i)^lest we forget$",
"(?i)^measure for measure$"
"(?i)^.*authors[']? response\\.?$"
]
},
"synonyms": {}

View File

@@ -1,381 +0,0 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "result",
"subEntityType": "resulttype",
"subEntityValue": "publication",
"orderField": "title",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "100",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
"resultResult_publicationDataset_isRelatedTo",
"resultResult_similarity_isAmongTopNSimilarDocuments",
"resultResult_similarity_hasAmongTopNSimilarDocuments",
"resultOrganization_affiliation_isAffiliatedWith",
"resultResult_part_hasPart",
"resultResult_part_isPartOf",
"resultResult_supplement_isSupplementTo",
"resultResult_supplement_isSupplementedBy",
"resultResult_version_isVersionOf"
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering" : [
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi", "altdoi" ], "params" : { "collapseOn:pid": "0"} }
],
"decisionTree": {
"start": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid",
"mode": "count"
}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "versionCheck",
"undefined": "versionCheck",
"ignoreUndefined": "true"
},
"versionCheck": {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "titleCheck",
"negative": "NO_MATCH",
"undefined": "titleCheck",
"ignoreUndefined": "false"
},
"titleCheck": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.9,
"aggregation": "MAX",
"positive": "authorsCheck",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"authorsCheck": {
"fields": [
{
"field": "authors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 0.6,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "MATCH",
"ignoreUndefined": "false"
}
},
"model": [
{
"name": "doi",
"type": "String",
"path": "$.instance[*].pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "altdoi",
"type": "String",
"path": "$.instance[*].alternateIdentifier[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "pid",
"type": "JSON",
"path": "$.instance[*].pid[*]",
"overrideMatch": "true"
},
{
"name": "title",
"type": "String",
"path": "$.title[?(@.qualifier.classid == 'main title')].value",
"length": 250,
"size": 5
},
{
"name": "authors",
"type": "List",
"path": "$.author[*].fullname",
"size": 200
},
{
"name": "resulttype",
"type": "String",
"path": "$.resulttype.classid"
}
],
"blacklists": {
"title": [
"(?i)^Data Management Plan",
"^Inside Front Cover$",
"(?i)^Poster presentations$",
"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
"^Problems with perinatal pathology\\.?$",
"(?i)^Cases? of Puerperal Convulsions$",
"(?i)^Operative Gyna?ecology$",
"(?i)^Mind the gap\\!?\\:?$",
"^Chronic fatigue syndrome\\.?$",
"^Cartas? ao editor Letters? to the Editor$",
"^Note from the Editor$",
"^Anesthesia Abstract$",
"^Annual report$",
"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
"(?i)^Graph and Table of Infectious Diseases?$",
"^Presentation$",
"(?i)^Reviews and Information on Publications$",
"(?i)^PUBLIC HEALTH SERVICES?$",
"(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
"(?i)^Adrese autora$",
"(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
"(?i)^Acknowledgement to Referees$",
"(?i)^Behçet's disease\\.?$",
"(?i)^Isolation and identification of restriction endonuclease.*$",
"(?i)^CEREBROVASCULAR DISEASES?.?$",
"(?i)^Screening for abdominal aortic aneurysms?\\.?$",
"^Event management$",
"(?i)^Breakfast and Crohn's disease.*\\.?$",
"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
"^Gushi hakubutsugaku$",
"^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
"^Intestinal spirocha?etosis$",
"^Treatment of Rodent Ulcer$",
"(?i)^\\W*Cloud Computing\\W*$",
"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
"^Free Communications, Poster Presentations: Session [A-F]$",
"^“The Historical Aspects? of Quackery\\.?”$",
"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
"(?i)^Case Report$",
"^Boletín Informativo$",
"(?i)^Glioblastoma Multiforme$",
"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
"^Zaměstnanecké výhody$",
"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
"(?i)^Carotid body tumours?\\.?$",
"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
"^Avant-propos$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
"(?i)^PUBLIC HEALTH VERSUS THE STATE$",
"^Viñetas de Cortázar$",
"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
"^Aus der AGMB$",
"^Znanstveno-stručni prilozi$",
"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
"(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
"^Finanční analýza podniku$",
"^Financial analysis( of business)?$",
"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
"^Jikken nihon shūshinsho$",
"(?i)^CORONER('|s)(s|') INQUESTS$",
"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
"(?i)^Consultants' contract(s)?$",
"(?i)^Upute autorima$",
"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
"^Joshi shin kokubun$",
"^Kōtō shōgaku dokuhon nōson'yō$",
"^Jinjō shōgaku shōka$",
"^Shōgaku shūjichō$",
"^Nihon joshi dokuhon$",
"^Joshi shin dokuhon$",
"^Chūtō kanbun dokuhon$",
"^Wabun dokuhon$",
"(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
"(?i)^cardiac rehabilitation$",
"(?i)^Analytical summary$",
"^Thesaurus resolutionum Sacrae Congregationis Concilii$",
"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
"^Prikazi i osvrti$",
"^Rodinný dům s provozovnou$",
"^Family house with an establishment$",
"^Shinsei chūtō shin kokugun$",
"^Pulmonary alveolar proteinosis(\\.?)$",
"^Shinshū kanbun$",
"^Viñeta(s?) de Rodríguez$",
"(?i)^RUBRIKA UREDNIKA$",
"^A Matching Model of the Academic Publication Market$",
"^Yōgaku kōyō$",
"^Internetový marketing$",
"^Internet marketing$",
"^Chūtō kokugo dokuhon$",
"^Kokugo dokuhon$",
"^Antibiotic Cover for Dental Extraction(s?)$",
"^Strategie podniku$",
"^Strategy of an Enterprise$",
"(?i)^respiratory disease(s?)(\\.?)$",
"^Award(s?) for Gallantry in Civil Defence$",
"^Podniková kultura$",
"^Corporate Culture$",
"^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
"^Pracovní motivace$",
"^Work Motivation$",
"^Kaitei kōtō jogaku dokuhon$",
"^Konsolidovaná účetní závěrka$",
"^Consolidated Financial Statements$",
"(?i)^intracranial tumour(s?)$",
"^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
"^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
"^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
"^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
"^Úroveň motivačního procesu jako způsobu vedení lidí$",
"^The level of motivation process as a leadership$",
"^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
"(?i)^news and events$",
"(?i)^NOVOSTI I DOGAĐAJI$",
"^Sansū no gakushū$",
"^Posouzení informačního systému firmy a návrh změn$",
"^Information System Assessment and Proposal for ICT Modification$",
"^Stresové zatížení pracovníků ve vybrané profesi$",
"^Stress load in a specific job$",
"^Sunday: Poster Sessions, Pt.*$",
"^Monday: Poster Sessions, Pt.*$",
"^Wednesday: Poster Sessions, Pt.*",
"^Tuesday: Poster Sessions, Pt.*$",
"^Analýza reklamy$",
"^Analysis of advertising$",
"^Shōgaku shūshinsho$",
"^Shōgaku sansū$",
"^Shintei joshi kokubun$",
"^Taishō joshi kokubun dokuhon$",
"^Joshi kokubun$",
"^Účetní uzávěrka a účetní závěrka v ČR$",
"(?i)^The \"?Causes\"? of Cancer$",
"^Normas para la publicación de artículos$",
"^Editor('|s)(s|') [Rr]eply$",
"^Editor(|s)(s|) letter$",
"^Redaktoriaus žodis$",
"^DISCUSSION ON THE PRECEDING PAPER$",
"^Kōtō shōgaku shūshinsho jidōyō$",
"^Shōgaku nihon rekishi$",
"^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
"^Préface$",
"^Occupational [Hh]ealth [Ss]ervices.$",
"^In Memoriam Professor Toshiyuki TAKESHIMA$",
"^Účetní závěrka ve vybraném podniku.*$",
"^Financial statements in selected company$",
"^Abdominal [Aa]ortic [Aa]neurysms.*$",
"^Pseudomyxoma peritonei$",
"^Kazalo autora$",
"(?i)^uvodna riječ$",
"^Motivace jako způsob vedení lidí$",
"^Motivation as a leadership$",
"^Polyfunkční dům$",
"^Multi\\-funkcional building$",
"^Podnikatelský plán$",
"(?i)^Podnikatelský záměr$",
"(?i)^Business Plan$",
"^Oceňování nemovitostí$",
"^Marketingová komunikace$",
"^Marketing communication$",
"^Sumario Analítico$",
"^Riječ uredništva$",
"^Savjetovanja i priredbe$",
"^Índice$",
"^(Starobosanski nadpisi).*$",
"^Vzdělávání pracovníků v organizaci$",
"^Staff training in organization$",
"^(Life Histories of North American Geometridae).*$",
"^Strategická analýza podniku$",
"^Strategic Analysis of an Enterprise$",
"^Sadržaj$",
"^Upute suradnicima$",
"^Rodinný dům$",
"(?i)^Fami(l)?ly house$",
"^Upute autorima$",
"^Strategic Analysis$",
"^Finanční analýza vybraného podniku$",
"^Finanční analýza$",
"^Riječ urednika$",
"(?i)^Content(s?)$",
"(?i)^Inhalt$",
"^Jinjō shōgaku shūshinsho jidōyō$",
"(?i)^Index$",
"^Chūgaku kokubun kyōkasho$",
"^Retrato de una mujer$",
"^Retrato de un hombre$",
"^Kōtō shōgaku dokuhon$",
"^Shotōka kokugo$",
"^Shōgaku dokuhon$",
"^Jinjō shōgaku kokugo dokuhon$",
"^Shinsei kokugo dokuhon$",
"^Teikoku dokuhon$",
"^Instructions to Authors$",
"^KİTAP TAHLİLİ$",
"^PRZEGLĄD PIŚMIENNICTWA$",
"(?i)^Presentación$",
"^İçindekiler$",
"(?i)^Tabl?e of contents$",
"^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
"^Editorial( Board)?$",
"(?i)^Editorial \\(English\\)$",
"^Editörden$",
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
"^(Kiri Karl Morgensternile).*$",
"^(\\[Eksliibris Aleksandr).*\\]$",
"^(\\[Eksliibris Aleksandr).*$",
"^(Eksliibris Aleksandr).*$",
"^(Kiri A\\. de Vignolles).*$",
"^(2 kirja Karl Morgensternile).*$",
"^(Pirita kloostri idaosa arheoloogilised).*$",
"^(Kiri tundmatule).*$",
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
"^(Eksliibris Nikolai Birukovile).*$",
"^(Eksliibris Nikolai Issakovile).*$",
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*",
"(?i)^.*authors[']? reply\\.?$",
"(?i)^.*authors[']? response\\.?$",
"^Data [mM]anagement [sS]ervices\\.$",
"Research and Advanced Technology for Digital Libraries",
"Food and Nutrition"
]
},
"synonyms": {}
}
}

View File

@@ -1,150 +0,0 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"subEntityType" : "resulttype",
"subEntityValue" : "software",
"orderField" : "title",
"queueMaxSize" : "200",
"groupMaxSize" : "100",
"maxChildren" : "100",
"slidingWindowSize" : "50",
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "wordsStatsSuffixPrefixChain", "fields" : [ "title" ], "params" : { "mod" : "10" } },
{ "name" : "lowercase", "fields" : [ "doi", "altdoi" ], "params" : { "collapseOn:pid":"0"} },
{ "name" : "ngrams", "fields" : [ "title" ], "params" : {"ngramLen": 3, "max": 4, "maxPerToken":1, "minNgramLen":3}},
{ "name" : "urlclustering", "fields": [ "url" ], "params" : {}}
],
"decisionTree": {
"start": {
"fields": [
{
"field": "doi",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "titleCheck",
"undefined": "titleCheck",
"ignoreUndefined": "false"
},
"titleCheck": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitleIgnoreVersion",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 0.95,
"aggregation": "AVG",
"positive": "pidCheck",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "false"
},
"pidCheck": {
"fields": [
{
"field": "altdoi",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
},
{
"field": "doi",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {"crossCompare": "altdoi"}
},
{
"field": "url",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1,
"aggregation": "OR",
"positive": "MATCH",
"negative": "authorsCheck",
"undefined": "authorsCheck",
"ignoreUndefined": "false"
},
"authorsCheck": {
"fields": [
{
"field": "authors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"surname_th": 0.70,
"fullname_th": 0.70,
"size_th": 20,
"mode": "surname"
}
}
],
"threshold": 1,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "MATCH",
"ignoreUndefined": "false"
}
},
"model" : [
{
"name" : "doi",
"type" : "String",
"path" : "$.instance.pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name" : "altdoi",
"type" : "String",
"path" : "$.instance.alternateIdentifier[?(@.qualifier.classid == 'doi')].value"
},
{
"name" : "title",
"type" : "String",
"path" : "$.title[?(@.qualifier.classid == 'main title')].value",
"length" : 250,
"size" : 5
},
{
"name" : "url",
"type" : "String",
"path" : "$.instance.url"
},
{
"name" : "resulttype",
"type" : "String",
"path" : "$.resulttype.classid"
},
{
"name": "authors",
"type": "List",
"path": "$.author[*].fullname",
"size": 200
}
],
"blacklists" : {},
"synonyms": {}
}
}
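
Read top to bottom, the tree above short-circuits: an exact DOI match is an immediate MATCH, a title below 0.95 similarity is an immediate NO_MATCH, and only then do the alternate PIDs, URLs and author lists get a say. A rough sketch of the control flow it encodes, with simplified stand-ins for the eu.dnetlib.pace comparators named in the config (exactMatch, levensteinTitleIgnoreVersion, authorsMatch):

import java.util.List;

public class SoftwareTreeSketch {

    // minimal record holding only the fields the tree consults
    static class Software {
        String doi, altDoi, url, title;
        List<String> authors;
    }

    // stand-ins for the real comparators; -1.0 means "undefined" as in the framework
    static double exactMatch(String a, String b) {
        return (a == null || b == null) ? -1.0 : (a.equals(b) ? 1.0 : 0.0);
    }

    // placeholder for levensteinTitleIgnoreVersion
    static double titleSimilarity(String a, String b) {
        return (a != null && a.equalsIgnoreCase(b)) ? 1.0 : 0.0;
    }

    // placeholder for authorsMatch in the surname mode configured above
    static double authorsSimilarity(List<String> a, List<String> b) {
        return (a == null || b == null) ? -1.0 : (a.equals(b) ? 1.0 : 0.0);
    }

    static boolean sameSoftware(Software a, Software b) {
        if (exactMatch(a.doi, b.doi) == 1.0) return true;            // start -> MATCH
        if (titleSimilarity(a.title, b.title) < 0.95) return false;  // titleCheck -> NO_MATCH
        if (exactMatch(a.altDoi, b.altDoi) == 1.0                    // pidCheck: OR aggregation
                || exactMatch(a.doi, b.altDoi) == 1.0                // crossCompare doi vs altdoi (the real tree checks both directions)
                || exactMatch(a.url, b.url) == 1.0) return true;
        double authors = authorsSimilarity(a.authors, b.authors);    // authorsCheck
        return authors == -1.0 || authors >= 1.0;                    // undefined counts as MATCH
    }
}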

View File

@ -1,4 +0,0 @@
{"websiteurl": "https://fairsharing.org", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "eosc________::oxford_e-research_centre::oxford_e-research_centre.fairsharing"}
{"websiteurl": "https://FAIRsharing.org", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "fairsharing_::2521"}
{"websiteurl": "https://fairsharing.org/", "englishname": "formerly: biosharing", "officialname": "FAIRsharing", "id": "re3data_____::r3d100010142"}
{"websiteurl": "https://fairsharing.org/", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "openaire____::fairsharing"}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,32 +0,0 @@
[
{
"paramName": "e",
"paramLongName": "entitiesPath",
"paramDescription": "the input entities",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingPath",
"paramDescription": "path of the working directory",
"paramRequired": true
},
{
"paramName": "np",
"paramLongName": "numPartitions",
"paramDescription": "number of partitions for the similarity relations intermediate phases",
"paramRequired": false
},
{
"paramName": "dc",
"paramLongName": "dedupConfPath",
"paramDescription": "dedup configuration to be used",
"paramRequired": false
},
{
"paramName": "gt",
"paramLongName": "groundTruthFieldJPath",
"paramDescription": "field to be used as groundtruth",
"paramRequired": false
}
]

View File

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-dedup</artifactId>
<version>4.1.13-SNAPSHOT</version>
<version>4.1.12</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@ -1,59 +1,59 @@
package eu.dnetlib.pace.clustering;
import com.google.common.collect.Maps;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Document;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.MapDocument;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.Set;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Document;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.MapDocument;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class BlacklistAwareClusteringCombiner extends ClusteringCombiner {
public static Collection<String> filterAndCombine(final MapDocument a, final Config conf) {
Document filtered = filter(a, conf.blacklists());
return combine(filtered, conf);
}
private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombiner.class);
private static MapDocument filter(final MapDocument a, final Map<String, List<Pattern>> blacklists) {
if (blacklists == null || blacklists.isEmpty()) {
return a;
}
public static Collection<String> filterAndCombine(final MapDocument a, final Config conf) {
final Map<String, Field> filtered = Maps.newHashMap(a.getFieldMap());
final Document filtered = new BlacklistAwareClusteringCombiner().filter(a, conf.blacklists());
return combine(filtered, conf);
}
for (final Entry<String, List<Pattern>> e : blacklists.entrySet()) {
Field fields = a.getFieldMap().get(e.getKey());
if (fields != null) {
final FieldListImpl fl = new FieldListImpl();
private MapDocument filter(final MapDocument a, final Map<String, List<String>> blacklists) {
final Map<String, Field> filtered = Maps.newHashMap(a.getFieldMap());
if (blacklists != null) {
for (final Entry<String, Field> e : filtered.entrySet()) {
for (Field f : fields) {
if (!isBlackListed(f.stringValue(), e.getValue())) {
fl.add(f);
}
}
filtered.put(e.getKey(), fl);
}
}
return new MapDocument(a.getIdentifier(), filtered);
}
private static boolean isBlackListed(String value, List<Pattern> blacklist) {
for (Pattern pattern : blacklist) {
if (pattern.matcher(value).matches()) {
return true;
}
}
return false;
}
final FieldListImpl fl = new FieldListImpl();
fl.addAll(Lists.newArrayList(Iterables.filter(e.getValue(), new FieldFilter(e.getKey(), blacklists))));
filtered.put(e.getKey(), fl);
}
}
return new MapDocument(a.getIdentifier(), filtered);
}
/**
 * Tries to match a field value against the regex blacklist.
 *
 * @param fieldName the name of the field whose blacklist entries apply
 * @param value the field value to test
 * @return true if the value matches a blacklisted regex, false otherwise
 */
protected boolean regexMatches(final String fieldName, final String value, final Map<String, Set<String>> blacklists) {
if (blacklists.containsKey(fieldName)) {
for (final String regex : blacklists.get(fieldName)) {
if (value.matches(regex)) return true;
}
}
return false;
}
}

View File

@ -20,6 +20,10 @@ public class ClusteringCombiner {
private static String COLLAPSE_ON= "collapseOn";
public static Collection<String> combine(final Document a, final Config conf) {
return new ClusteringCombiner().doCombine(a, conf);
}
private Collection<String> doCombine(final Document a, final Config conf) {
final Collection<String> res = Sets.newLinkedHashSet();
for (final ClusteringDef cd : conf.clusterings()) {
for (final String fieldName : cd.getFields()) {
@ -47,7 +51,7 @@ public class ClusteringCombiner {
return res;
}
private static String getPrefix(ClusteringDef cd, String fieldName) {
private String getPrefix(ClusteringDef cd, String fieldName) {
return cd.getName()+ SEPARATOR +
cd.getParams().keySet()
.stream()

View File

@ -0,0 +1,48 @@
package eu.dnetlib.pace.clustering;
import java.util.List;
import java.util.Map;
import com.google.common.base.Predicate;
import eu.dnetlib.pace.model.Field;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class FieldFilter implements Predicate<Field> {
private static final Log log = LogFactory.getLog(FieldFilter.class);
private Map<String, List<String>> blacklists;
private String fieldName;
public FieldFilter(final String fieldName, final Map<String, List<String>> blacklists) {
this.fieldName = fieldName;
this.blacklists = blacklists;
}
@Override
public boolean apply(final Field f) {
return !regexMatches(fieldName, f.stringValue(), blacklists);
}
/**
 * Tries to match a field value against the regex blacklist.
 *
 * @param fieldName the name of the field whose blacklist entries apply
 * @param value the field value to test
 * @return true if the value matches a blacklisted regex, false otherwise
 */
protected boolean regexMatches(final String fieldName, final String value, final Map<String, List<String>> blacklists) {
if (blacklists.containsKey(fieldName)) {
final Iterable<String> regexes = blacklists.get(fieldName);
for (final String regex : regexes) {
if (StringUtils.isBlank(regex)) return false;
if (value.matches(regex)) return true;
}
}
return false;
}
}
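
Being a Guava Predicate, FieldFilter plugs straight into the Iterables.filter call used by the combiner above. A minimal usage sketch (the blacklist map and field values are made up; it assumes the pace model classes shown elsewhere in this diff are on the classpath):

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.clustering.FieldFilter;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldValueImpl;

import java.util.Arrays;
import java.util.List;
import java.util.Map;

public class FieldFilterSketch {
    public static void main(String[] args) {
        // one blacklisted regex on the "title" field
        Map<String, List<String>> blacklists =
                ImmutableMap.of("title", Arrays.asList("(?i)^Editorial$"));

        List<Field> titles = Lists.<Field>newArrayList(
                new FieldValueImpl(Type.String, "title", "Editorial"),
                new FieldValueImpl(Type.String, "title", "A real title"));

        // only the non-blacklisted value survives
        for (Field f : Iterables.filter(titles, new FieldFilter("title", blacklists)))
            System.out.println(f.stringValue()); // prints "A real title"
    }
}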

View File

@ -1,77 +0,0 @@
package eu.dnetlib.pace.clustering;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.Person;
import org.apache.commons.lang3.StringUtils;
import java.util.*;
import java.util.stream.Collectors;
@ClusteringClass("lnfi")
public class LastNameFirstInitial extends AbstractClusteringFunction{
private boolean DEFAULT_AGGRESSIVE = true;
public LastNameFirstInitial(final Map<String, Integer> params) {
super(params);
}
@Override
public Collection<String> apply(Config conf, List<Field> fields) {
return fields.stream().filter(f -> !f.isEmpty())
.map(Field::stringValue)
.map(this::normalize)
.map(s -> doApply(conf, s))
.map(c -> filterBlacklisted(c, ngramBlacklist))
.flatMap(c -> c.stream())
.filter(StringUtils::isNotBlank)
.collect(Collectors.toCollection(HashSet::new));
}
@Override
protected String normalize(final String s) {
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
// do not compact the regexes into a single expression, as it would cause a StackOverflowError on large input strings
.replaceAll("[^ \\w]+", "")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
}
@Override
protected Collection<String> doApply(final Config conf, final String s) {
final List<String> res = Lists.newArrayList();
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
Person p = new Person(s, aggressive);
if (p.isAccurate()) {
String lastName = p.getNormalisedSurname().toLowerCase();
String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0,1);
res.add(firstInitial.concat(lastName));
}
else { // is not accurate, meaning it has no defined name and surname
List<String> fullname = Arrays.asList(p.getNormalisedFullname().split(" "));
if (fullname.size() == 1) {
res.add(p.getNormalisedFullname().toLowerCase());
}
else if (fullname.size() == 2) {
res.add(fullname.get(0).substring(0,1).concat(fullname.get(1)).toLowerCase());
res.add(fullname.get(1).substring(0,1).concat(fullname.get(0)).toLowerCase());
}
else {
res.add(fullname.get(0).substring(0,1).concat(fullname.get(fullname.size()-1)).toLowerCase());
res.add(fullname.get(fullname.size()-1).substring(0,1).concat(fullname.get(0)).toLowerCase());
}
}
return res;
}
}
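
A quick worked example of the keys emitted above: a parse that confidently identifies surname and first name (Person.isAccurate) yields a single key, first initial plus surname, e.g. 'pmanghi' for Paolo Manghi; an ambiguous two-token fullname instead yields both orderings ('pmanghi' and 'mpaolo'), so either name/surname reading lands in a shared cluster.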

View File

@ -9,7 +9,7 @@ import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
@ClusteringClass("personHash")
@ClusteringClass("personhash")
public class PersonHash extends AbstractClusteringFunction {
private boolean DEFAULT_AGGRESSIVE = false;

View File

@ -3,23 +3,28 @@ package eu.dnetlib.pace.common;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.ibm.icu.text.Transliterator;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.FieldValueImpl;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.*;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import com.ibm.icu.text.Transliterator;
/**
* Set of common functions for the framework
@ -128,12 +133,10 @@ public abstract class AbstractPaceFunctions {
protected static String fixAliases(final String s) {
final StringBuilder sb = new StringBuilder();
s.chars().forEach(ch -> {
for (final char ch : Lists.charactersOf(s)) {
final int i = StringUtils.indexOf(aliases_from, ch);
sb.append(i >= 0 ? aliases_to.charAt(i) : (char)ch);
});
sb.append(i >= 0 ? aliases_to.charAt(i) : ch);
}
return sb.toString();
}
@ -149,10 +152,9 @@ public abstract class AbstractPaceFunctions {
protected String removeSymbols(final String s) {
final StringBuilder sb = new StringBuilder();
s.chars().forEach(ch -> {
sb.append(StringUtils.contains(alpha, ch) ? (char)ch : ' ');
});
for (final char ch : Lists.charactersOf(s)) {
sb.append(StringUtils.contains(alpha, ch) ? ch : " ");
}
return sb.toString().replaceAll("\\s+", " ");
}
@ -239,7 +241,7 @@ public abstract class AbstractPaceFunctions {
final Set<String> h = Sets.newHashSet();
try {
for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) {
h.add(fixAliases(transliterator.transliterate(s))); //transliteration of the stopwords
}
} catch (final Throwable e) {
@ -254,7 +256,7 @@ public abstract class AbstractPaceFunctions {
final Map<String, String> m = new HashMap<>();
try {
for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) {
//string is like this: code;word1;word2;word3
String[] line = s.split(";");
String value = line[0];
@ -347,7 +349,7 @@ public abstract class AbstractPaceFunctions {
public static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
final StringWriter sw = new StringWriter();
try {
IOUtils.copy(clazz.getResourceAsStream(filename), sw, StandardCharsets.UTF_8);
IOUtils.copy(clazz.getResourceAsStream(filename), sw);
return sw.toString();
} catch (final IOException e) {
throw new RuntimeException("cannot load resource from classpath: " + filename);
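
The fixAliases rewrite above swaps a Guava character loop for an IntStream over s.chars(); a self-contained sketch of the same alias-substitution idea (the two alias tables here are illustrative stand-ins for the framework's real ones):

import org.apache.commons.lang3.StringUtils;

public class FixAliasesSketch {
    // each char in FROM is rewritten to the char at the same index in TO
    private static final String ALIASES_FROM = "⁰¹²³";
    private static final String ALIASES_TO = "0123";

    static String fixAliases(final String s) {
        final StringBuilder sb = new StringBuilder();
        s.chars().forEach(ch -> {
            final int i = StringUtils.indexOf(ALIASES_FROM, ch);
            sb.append(i >= 0 ? ALIASES_TO.charAt(i) : (char) ch);
        });
        return sb.toString();
    }

    public static void main(String[] args) {
        System.out.println(fixAliases("x² + y³")); // prints "x2 + y3"
    }
}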

View File

@ -2,7 +2,6 @@ package eu.dnetlib.pace.config;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef;
@ -48,7 +47,7 @@ public interface Config {
*
* @return the map
*/
public Map<String, List<Pattern>> blacklists();
public Map<String, List<String>> blacklists();
/**

View File

@ -1,6 +1,5 @@
package eu.dnetlib.pace.config;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Maps;
import eu.dnetlib.pace.model.ClusteringDef;
@ -8,19 +7,15 @@ import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.util.PaceException;
import org.antlr.stringtemplate.StringTemplate;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
@ -36,9 +31,6 @@ public class DedupConfig implements Config, Serializable {
private WfConfig wf;
@JsonIgnore
private Map<String, List<Pattern>> blacklists;
private static Map<String, String> defaults = Maps.newHashMap();
static {
@ -65,12 +57,6 @@ public class DedupConfig implements Config, Serializable {
config = new ObjectMapper().readValue(json, DedupConfig.class);
config.getPace().initModel();
config.getPace().initTranslationMap();
config.blacklists = config.getPace().getBlacklists().entrySet()
.stream()
.collect(Collectors.toMap(e -> e.getKey(),
e ->e.getValue().stream().filter(s -> !StringUtils.isBlank(s)).map(Pattern::compile).collect(Collectors.toList()) ));
return config;
} catch (IOException e) {
throw new PaceException("Error in parsing configuration json", e);
@ -102,7 +88,7 @@ public class DedupConfig implements Config, Serializable {
}
private String readFromClasspath(final String resource) throws IOException {
return IOUtils.toString(getClass().getResource(resource), StandardCharsets.UTF_8);
return IOUtils.toString(getClass().getResource(resource));
}
public PaceConfig getPace() {
@ -151,8 +137,8 @@ public class DedupConfig implements Config, Serializable {
}
@Override
public Map<String, List<Pattern>> blacklists() {
return blacklists;
public Map<String, List<String>> blacklists() {
return getPace().getBlacklists();
}
@Override

View File

@ -1,5 +1,5 @@
package eu.dnetlib.pace.config;
public enum Type {
String, Int, List, JSON, URL, StringConcat, DoubleArray
String, Int, List, JSON, URL, StringConcat
}

View File

@ -20,6 +20,4 @@ public interface FieldValue extends Field {
*/
public void setValue(final Object value);
public double[] doubleArrayValue();
}

View File

@ -58,10 +58,8 @@ public class FieldValueImpl extends AbstractField implements FieldValue {
throw new RuntimeException(value.toString());
}
case URL:
String str = value.toString();
return StringUtils.isBlank(str) || !isValidURL(str);
case DoubleArray:
return doubleArrayValue().length==0;
String str = value.toString();
return StringUtils.isBlank(str) || !isValidURL(str);
default:
return true;
}
@ -118,10 +116,6 @@ public class FieldValueImpl extends AbstractField implements FieldValue {
// }
}
public double[] doubleArrayValue() {
return (double[])getValue();
}
/*
* (non-Javadoc)
*

View File

@ -43,7 +43,7 @@ public class Person {
// s = s.replaceAll("[\\W&&[^,-]]", "");
}
if (s.contains(",")) { //if the name contains a comma, the name and the surname are easily derivable
if (s.contains(",")) {
final String[] arr = s.split(",");
if (arr.length == 1) {
fullname = splitTerms(arr[0]);

View File

@ -26,7 +26,6 @@ public class AuthorsMatch extends AbstractComparator {
private double FULLNAME_THRESHOLD;
private String MODE; //full or surname
private int SIZE_THRESHOLD;
private String TYPE; //count or percentage
private int common;
public AuthorsMatch(Map<String, String> params){
@ -38,7 +37,6 @@ public class AuthorsMatch extends AbstractComparator {
NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95"));
FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20"));
TYPE = params.getOrDefault("type", "percentage");
common = 0;
}
@ -52,7 +50,7 @@ public class AuthorsMatch extends AbstractComparator {
if (a.isEmpty() || b.isEmpty())
return -1;
if (((FieldList) a).size() > SIZE_THRESHOLD || ((FieldList) b).size() > SIZE_THRESHOLD)
if (((FieldList) a).size() > SIZE_THRESHOLD || ((FieldList) b).size() > SIZE_THRESHOLD)
return 1.0;
List<Person> aList = ((FieldList) a).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList());
@ -125,12 +123,7 @@ public class AuthorsMatch extends AbstractComparator {
//normalization factor to compute the score
int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common);
if(TYPE.equals("percentage")) {
return (double) common / normFactor;
}
else {
return (double) common;
}
return (double)common / normFactor;
}
public boolean compareSurname(Person p1, Person p2) {
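
Worked example of the normalization factor above: author lists of sizes 3 and 4 with common = 2 give normFactor = 3 + 4 − 2 = 5, hence a percentage score of 2/5 = 0.4; only when the two lists have equal size is the factor simply that size. Note that the 4.1.12 side drops the 'count' variant, so the percentage is always returned.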

View File

@ -1,53 +0,0 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.Person;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@ComparatorClass("cosineSimilarity")
public class CosineSimilarity extends AbstractComparator {
Map<String, String> params;
public CosineSimilarity(Map<String,String> params) {
super(params);
}
@Override
public double compare(final Field a, final Field b, final Config conf) {
if (a.isEmpty() || b.isEmpty())
return -1;
double[] aVector = ((FieldValueImpl) a).doubleArrayValue();
double[] bVector = ((FieldValueImpl) b).doubleArrayValue();
return cosineSimilarity(aVector, bVector);
}
double cosineSimilarity(double[] a, double[] b) {
double dotProduct = 0;
double normASum = 0;
double normBSum = 0;
for(int i = 0; i < a.length; i ++) {
dotProduct += a[i] * b[i];
normASum += a[i] * a[i];
normBSum += b[i] * b[i];
}
double normProduct = Math.sqrt(normASum) * Math.sqrt(normBSum);
return dotProduct / normProduct;
}
}
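
The removed comparator computes the standard cosine score, dot(a, b) / (||a|| * ||b||). A usage sketch mirroring the unit test deleted further down (it assumes the master-branch classes above are on the classpath; the Config argument is not consulted by the code shown, so null stands in):

import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.tree.CosineSimilarity;

import java.util.HashMap;

public class CosineSketch {
    public static void main(String[] args) {
        CosineSimilarity cosine = new CosineSimilarity(new HashMap<>());
        Field a = new FieldValueImpl(Type.DoubleArray, "array", new double[]{1, 2, 3});
        Field b = new FieldValueImpl(Type.DoubleArray, "array", new double[]{1, 2, 3});
        System.out.println(cosine.compare(a, b, null)); // identical vectors -> 1.0
    }
}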

View File

@ -16,7 +16,6 @@ public class DomainExactMatch extends ExactMatchIgnoreCase {
@Override
protected String getValue(final Field f) {
try {
return asUrl(super.getValue(f)).getHost();
} catch (MalformedURLException e) {

View File

@ -1,34 +0,0 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("numbersComparator")
public class NumbersComparator extends AbstractComparator {
Map<String, String> params;
public NumbersComparator(Map<String, String> params) {
super(params);
this.params = params;
}
@Override
public double distance(String a, String b, Config conf) {
//extracts numbers from the field
String numbers1 = getNumbers(nfd(a));
String numbers2 = getNumbers(nfd(b));
if (numbers1.isEmpty() || numbers2.isEmpty())
return -1.0;
int n1 = Integer.parseInt(numbers1);
int n2 = Integer.parseInt(numbers2);
return Math.abs(n1 - n2);
}
}
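
Worked example: for field values 'Volume 3' and 'Volume 15' the digits extracted are '3' and '15', so the distance is |3 − 15| = 12; if either value carries no digits at all the comparator returns the framework's −1.0 'undefined' marker (this assumes getNumbers simply concatenates the digits it finds).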

View File

@ -42,25 +42,22 @@ public class StringContainsMatch extends AbstractComparator {
STRING = STRING.toLowerCase();
}
if (AGGREGATOR != null) {
switch (AGGREGATOR) {
case "AND":
if (ca.contains(STRING) && cb.contains(STRING))
return 1.0;
break;
case "OR":
if (ca.contains(STRING) || cb.contains(STRING))
return 1.0;
break;
case "XOR":
if (ca.contains(STRING) ^ cb.contains(STRING))
return 1.0;
break;
default:
return 0.0;
}
switch(AGGREGATOR) {
case "AND":
if(ca.contains(STRING) && cb.contains(STRING))
return 1.0;
break;
case "OR":
if(ca.contains(STRING) || cb.contains(STRING))
return 1.0;
break;
case "XOR":
if(ca.contains(STRING) ^ cb.contains(STRING))
return 1.0;
break;
default:
return 0.0;
}
return 0.0;
}
}
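
The aggregator parameter above accepts AND, OR and XOR; a usage sketch mirroring the comparator test deleted further down (the Config argument is not consulted by the fragment shown, so null stands in):

import eu.dnetlib.pace.tree.StringContainsMatch;

import java.util.HashMap;
import java.util.Map;

public class StringContainsSketch {
    public static void main(String[] args) {
        Map<String, String> params = new HashMap<>();
        params.put("string", "openorgs");
        params.put("aggregator", "XOR"); // exactly one side may contain the string
        params.put("caseSensitive", "false");
        // both sides contain "openorgs": XOR is false -> 0.0
        System.out.println(new StringContainsMatch(params).distance("openorgs", "openorgs", null));

        params.put("aggregator", "AND"); // both sides must contain it
        System.out.println(new StringContainsMatch(params).distance("the openorgs db", "an openorgs fork", null)); // 1.0
    }
}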

View File

@ -19,13 +19,9 @@ public class StringListMatch extends AbstractComparator {
private static final Log log = LogFactory.getLog(StringListMatch.class);
private Map<String, String> params;
final private String TYPE; //percentage or count
public StringListMatch(final Map<String, String> params) {
super(params);
this.params = params;
TYPE = params.getOrDefault("type", "percentage");
}
@Override
@ -35,7 +31,7 @@ public class StringListMatch extends AbstractComparator {
final Set<String> pb = new HashSet<>(((FieldList) b).stringList());
if (pa.isEmpty() || pb.isEmpty()) {
return -1; //return undefined if one of the two lists is empty
return -1; //return undefined if one of the two lists of pids is empty
}
int incommon = Sets.intersection(pa, pb).size();
@ -45,10 +41,7 @@ public class StringListMatch extends AbstractComparator {
return 0.0;
}
if(TYPE.equals("percentage"))
return (double)incommon / (incommon + simDiff);
else
return incommon;
return (double)incommon / (incommon + simDiff);
}
}
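
Worked example of the percentage score kept on the 4.1.12 side: pa = {A, B, C} and pb = {B, C, D} give incommon = 2 and simDiff = 2, hence 2 / (2 + 2) = 0.5; the master branch could alternatively return the raw count 2 via type=count, an option this release drops.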

View File

@ -1,6 +1,5 @@
package eu.dnetlib.pace.tree.support;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.PaceConfig;
@ -10,7 +9,6 @@ import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.io.Serializable;
import java.io.StringWriter;
import java.util.List;
public class TreeNodeDef implements Serializable {
@ -59,9 +57,8 @@ public class TreeNodeDef implements Serializable {
double result2 = comparator(fieldConf).compare(doc1.getFieldMap().get(crossField), doc2.getFieldMap().get(fieldConf.getField()), conf);
result = Math.max(result1,result2);
}
else {
else
result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
}
stats.addFieldStats(
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),

View File

@ -161,14 +161,11 @@ public class BlockProcessorForTesting {
}
else {
//use the decision tree implementation or the "normal" implementation of the similarity score (valid only for publications)
if (useTree)
if(useTree)
emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
else
emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
}
// if(new TreeProcessor(dedupConf).compare(pivot, curr) != publicationCompare(pivot, curr, dedupConf)) {
// emitOutput(true, idPivot, idCurr, context);
// }
}
}
@ -183,45 +180,38 @@ public class BlockProcessorForTesting {
return compare>=1.0;
}
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
//if the score gives 1, the publications are equivalent
Map<String, String> params = new HashMap<>();
params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid");
params.put("mode", "count");
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
double score = 0.0;
double score = 0.0;
//LAYER 1 - comparison of the PIDs json lists
Map<String, String> params = new HashMap<>();
params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid");
JsonListMatch jsonListMatch = new JsonListMatch(params);
double result = jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config);
if (result >= 0.5) //if the result of the comparison is greater than the threshold
score += 10.0; //high score because it should match when the first condition is satisfied
else
score += 0.0;
//levenstein title
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
if(levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config) >= 0.9) {
score += 0.2;
//LAYER 2 - comparison of the title version and the size of the authors lists
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
SizeMatch sizeMatch = new SizeMatch(params);
double result2 = sizeMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
if (Math.min(result1, result2) != 0)
score+=0;
else
score-=2;
//LAYER 3 - computation of levenshtein on titles
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
double result3 = levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
score += Double.isNaN(result3)?0.0:result3;
return score >= 0.99;
}
//pid
JsonListMatch jsonListMatch = new JsonListMatch(params);
if (jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config) >= 1.0) {
score += 0.5;
}
//title version
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
if(result1<0 || result1>=1.0) {
score += 0.1;
}
//authors match
params.remove("mode");
AuthorsMatch authorsMatch = new AuthorsMatch(params);
double result2 = authorsMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
if(result2 <0|| result2>=0.6) {
score += 0.2;
}
return score>=0.5;
}
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
if (result) {
@ -244,5 +234,6 @@ public class BlockProcessorForTesting {
final String type = dedupConf.getWf().getEntityType();
context.emit(type, from, to);
context.emit(type, to, from);
}
}

View File

@ -7,10 +7,12 @@ import com.jayway.jsonpath.JsonPath;
import com.jayway.jsonpath.Option;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.*;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.MapDocument;
import net.minidev.json.JSONArray;
import java.math.BigDecimal;
import java.util.*;
import java.util.function.Predicate;
import java.util.stream.Collectors;
@ -44,14 +46,6 @@ public class MapDocumentUtil {
.forEach(fi::add);
stringField.put(fdef.getName(), fi);
break;
case DoubleArray:
stringField.put(
fdef.getName(),
new FieldValueImpl(Type.DoubleArray,
fdef.getName(),
getJPathArray(fdef.getPath(), json))
);
break;
case StringConcat:
String[] jpaths = fdef.getPath().split("\\|\\|\\|");
stringField.put(
@ -121,30 +115,6 @@ public class MapDocumentUtil {
}
}
public static double[] getJPathArray(final String jsonPath, final String json) {
try {
Object o = JsonPath.read(json, jsonPath);
if (o instanceof double[])
return (double[]) o;
if (o instanceof JSONArray) {
Object[] objects = ((JSONArray) o).toArray();
double[] array = new double[objects.length];
for (int i = 0; i < objects.length; i++) {
if (objects[i] instanceof BigDecimal)
array[i] = ((BigDecimal)objects[i]).doubleValue();
else
array[i] = (double) objects[i];
}
return array;
}
return new double[0];
}
catch (Exception e) {
e.printStackTrace();
return new double[0];
}
}
public static String truncateValue(String value, int length) {
if (value == null)
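
The removed getJPathArray resolves a JSONPath expression to a double[], converting BigDecimal entries along the way. A usage sketch against a topics vector like the one in the author record deleted further down (it assumes the master-branch class above is on the classpath):

import eu.dnetlib.pace.util.MapDocumentUtil;

public class JPathArraySketch {
    public static void main(String[] args) {
        String json = "{\"topics\": [0.9522090839562252, 0.04779091604377485]}";
        double[] topics = MapDocumentUtil.getJPathArray("$.topics", json);
        System.out.println(topics.length); // 2; missing or unreadable paths yield an empty array
    }
}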

View File

@ -9,7 +9,6 @@ import org.apache.commons.io.IOUtils;
import java.io.IOException;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.stream.Collectors;
@ -18,7 +17,7 @@ public abstract class AbstractPaceTest extends AbstractPaceFunctions {
protected String readFromClasspath(final String filename) {
final StringWriter sw = new StringWriter();
try {
IOUtils.copy(getClass().getResourceAsStream(filename), sw, StandardCharsets.UTF_8);
IOUtils.copy(getClass().getResourceAsStream(filename), sw);
return sw.toString();
} catch (final IOException e) {
throw new RuntimeException("cannot load resource from classpath: " + filename);
@ -37,10 +36,6 @@ public abstract class AbstractPaceTest extends AbstractPaceFunctions {
return new FieldValueImpl(Type.URL, "url", s);
}
protected Field array(final double[] a) {
return new FieldValueImpl(Type.DoubleArray, "array", a);
}
protected Field createFieldList(List<String> strings, String fieldName){
List<FieldValueImpl> fieldValueStream = strings.stream().map(s -> new FieldValueImpl(Type.String, fieldName, s)).collect(Collectors.toList());

View File

@ -103,11 +103,6 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s);
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
params.put("len", 3);
params.put("max", 1);
System.out.println(sp.apply(conf, Lists.newArrayList(title("Framework for general-purpose deduplication"))));
}
@Test
@ -153,10 +148,6 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
System.out.println(s);
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
s = "niivue/niivue: 0.21.1";
System.out.println(s);
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
}
@Test
@ -209,41 +200,4 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
}
@Test
public void testPersonClustering(){
final ClusteringFunction cf = new PersonClustering(params);
final String s = "Abd-Alla, Abo-el-nour N.";
System.out.println("s = " + s);
System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
final String s1 = "Manghi, Paolo";
System.out.println("s1 = " + s1);
System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
}
@Test
public void testPersonHash(){
final ClusteringFunction cf = new PersonHash(params);
final String s = "Manghi, Paolo";
System.out.println("s = " + s);
System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
final String s1 = "Manghi, P.";
System.out.println("s = " + s1);
System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
}
@Test
public void testLastNameFirstInitial(){
final ClusteringFunction cf = new LastNameFirstInitial(params);
final String s = "LI Yonghong";
System.out.println("s = " + s);
System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
}
}
}

View File

@ -2,16 +2,13 @@ package eu.dnetlib.pace.comparators;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.tree.*;
import eu.dnetlib.pace.config.DedupConfig;
import org.junit.jupiter.api.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
@ -24,20 +21,15 @@ public class ComparatorTest extends AbstractPaceTest {
@BeforeAll
public void setup() {
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class));
}
@BeforeEach
public void beforeEachTest() {
params = new HashMap<>();
params.put("weight", "1.0");
params.put("surname_th", "0.99");
params.put("name_th", "0.95");
params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid");
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class));
}
@Test
public void testCleanForSorting() {
NGramUtils utils = new NGramUtils();
@ -64,10 +56,7 @@ public class ComparatorTest extends AbstractPaceTest {
//particular cases
assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf));
assertEquals(1.0, cityMatch.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf));
// failing because 'Allen' is a transliterated Greek stopword
// assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
assertEquals(-1.0, cityMatch.distance("Washington (United States)", "United States Military Academy", conf));
assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
}
@Test
@ -81,7 +70,7 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf));
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf));
assertEquals(2.0/3.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
assertEquals(0.5, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
assertEquals(0.5, keywordMatch.distance("University College London", "University of London", conf));
assertEquals(0.5, keywordMatch.distance("Washington State University", "University of Washington", conf));
assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf));
@ -115,7 +104,7 @@ public class ComparatorTest extends AbstractPaceTest {
public void stringContainsMatchTest(){
params.put("string", "openorgs");
params.put("aggregator", "XOR");
params.put("bool", "XOR");
params.put("caseSensitive", "false");
StringContainsMatch stringContainsMatch = new StringContainsMatch(params);
@ -123,7 +112,7 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(0.0, stringContainsMatch.distance("openorgs", "openorgs", conf));
params.put("string", "openorgs");
params.put("aggregator", "AND");
params.put("bool", "AND");
params.put("caseSensitive", "false");
stringContainsMatch = new StringContainsMatch(params);
@ -257,10 +246,6 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(0.25, result);
Field f = createFieldList(new ArrayList<>(), "authors");
result = authorsMatch.compare(f,f, conf);
System.out.println("result = " + result);
}
@Test
@ -282,30 +267,5 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(1.0, result);
}
@Test
public void domainExactMatch() {
DomainExactMatch domainExactMatch = new DomainExactMatch(params);
Field a = url("http://www.flowrepository.org");
Field b = url("http://flowrepository.org/");
double compare = domainExactMatch.compare(a, b, conf);
System.out.println("compare = " + compare);
}
@Test
public void cosineSimilarity() {
CosineSimilarity cosineSimilarity = new CosineSimilarity(params);
Field a = new FieldValueImpl(Type.DoubleArray, "array", new double[]{1,2,3});
Field b = new FieldValueImpl(Type.DoubleArray, "array", new double[]{1,2,3});
double compare = cosineSimilarity.compare(a, b, conf);
System.out.println("compare = " + compare);
}
}

View File

@ -7,7 +7,6 @@ import eu.dnetlib.pace.clustering.ClusteringClass;
import eu.dnetlib.pace.clustering.ClusteringCombiner;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.FieldValue;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.tree.JsonListMatch;
import eu.dnetlib.pace.tree.support.AggType;
@ -21,7 +20,10 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.*;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@ -83,7 +85,7 @@ public class ConfigTest extends AbstractPaceTest {
}
@Test
public void asMapDocumentTest1() {
public void asMapDocumentTest() {
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json"));
@ -101,19 +103,6 @@ public class ConfigTest extends AbstractPaceTest {
}
@Test
public void authorAsMapDocument() {
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("author.fdup.conf.json"));
final String json = readFromClasspath("author.json");
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
System.out.println("mapDocument = " + Arrays.toString(((FieldValue) mapDocument.getFieldMap().get("topics")).doubleArrayValue()));
}
@Test
public void testJPath() {
final String json = readFromClasspath("organization.json");

View File

@ -1,6 +1,7 @@
package eu.dnetlib.pace.util;
import eu.dnetlib.pace.model.Person;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.*;
import java.util.HashMap;
@ -17,6 +18,7 @@ public class UtilTest {
}
@Test
@Disabled
public void paceResolverTest() {
PaceResolver paceResolver = new PaceResolver();
paceResolver.getComparator("keywordMatch", params);
@ -28,11 +30,6 @@ public class UtilTest {
assertEquals("kennedy", p.getSurnameString());
assertEquals("j f", p.getNameString());
p = new Person("Guan-Hua Du", false);
System.out.println("surname = " + p.getSurnameString());
System.out.println("name = " + p.getNameString());
}
}

View File

@ -1,134 +0,0 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "author",
"subEntityType": "author",
"subEntityValue": "author",
"orderField": "fullname",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "50",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
"resultResult_publicationDataset_isRelatedTo",
"resultResult_similarity_isAmongTopNSimilarDocuments",
"resultResult_similarity_hasAmongTopNSimilarDocuments",
"resultOrganization_affiliation_isAffiliatedWith",
"resultResult_part_hasPart",
"resultResult_part_isPartOf",
"resultResult_supplement_isSupplementTo",
"resultResult_supplement_isSupplementedBy",
"resultResult_version_isVersionOf"
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering" : [
{ "name" : "lnfi", "fields" : [ "fullname" ], "params" : {} }
],
"decisionTree": {
"start": {
"fields": [
{
"field": "orcid",
"comparator": "exactMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "orcids",
"ignoreUndefined": "true"
},
"orcids": {
"fields": [
{
"field": "orcids",
"comparator": "stringListMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 3.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "coauthors",
"undefined": "coauthors",
"ignoreUndefined": "true"
},
"coauthors": {
"fields": [
{
"field": "coauthors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "topicsMatch",
"negative": "NO_MATCH",
"undefined": "topicsMatch",
"ignoreUndefined": "true"
},
"topicsMatch": {
"fields": [
{
"field": "topics",
"comparator": "cosineSimilarity",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "false"
}
},
"model": [
{
"name": "topics",
"type": "DoubleArray",
"path": "$.topics"
},
{
"name": "fullname",
"type": "String",
"path": "$.fullname"
},
{
"name": "orcid",
"type": "String",
"path": "$.orcid"
},
{
"name": "coauthors",
"type": "List",
"path": "$.coAuthors[*].fullname"
},
{
"name": "orcids",
"type": "List",
"path": "$.coAuthors[*].orcid"
}
],
"blacklists": {},
"synonyms": {}
}
}

View File

@ -1 +0,0 @@
{"fullname":"Zaragoza, Maria Cleofé","firstname":"Maria Cleofé","lastname":"Zaragoza","coAuthors":[{"fullname":"Cambras, Trinitat","lastname":"Cambras","firstname":"Trinitat","orcid":"0000-0002-9009-4690"},{"fullname":"Castro-Marrero, Jesús","lastname":"Castro-Marrero","firstname":"Jesús","orcid":""},{"fullname":"Díez-Noguera, Antoni","lastname":"Díez-Noguera","firstname":"Antoni","orcid":""},{"fullname":"Alegre, José","lastname":"Alegre","firstname":"José","orcid":"0000-0002-7582-7585"}],"topics":[0.9522090839562252,0.04779091604377485],"orcid":"0000-0002-9797-0219","id":"author::1a10826c83c7f9f0dcebe7df05e37a2a","pubId":"50|pmid________::db7fd19db5a620eafad40cfb97f9690d"}

30
pom.xml
View File

@ -5,7 +5,7 @@
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-dedup</artifactId>
<version>4.1.13-SNAPSHOT</version>
<version>4.1.12</version>
<packaging>pom</packaging>
@ -22,7 +22,7 @@
<scm>
<developerConnection>scm:git:https://code-repo.d4science.org/D-Net/dnet-dedup.git</developerConnection>
<tag>dnet-dedup-4.0.3</tag>
<tag>dnet-dedup-4.1.12</tag>
</scm>
<modules>
@ -144,7 +144,14 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.22.0</version>
<version>2.19.1</version>
<dependencies>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>${junit-jupiter.version}</version>
</dependency>
</dependencies>
<configuration>
<redirectTestOutputToFile>false</redirectTestOutputToFile>
</configuration>
@ -254,7 +261,7 @@
<oozie.use.system.libpath>true</oozie.use.system.libpath>
<properties.maven.plugin.version>2.0.1</properties.maven.plugin.version>
<junit-jupiter.version>5.6.1</junit-jupiter.version>
<maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>../dhp-build/dhp-build-assembly-resources/target/dhp-build-assembly-resources-${project.version}.jar</maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>
<maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>../dhp-build/dhp-build-assembly-resources/target/dhp-build-assembly-resources-4.1.8-SNAPSHOT.jar</maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>
</properties>
@ -403,12 +410,27 @@
<version>2.4.0</version>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<version>3.3.3</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-junit-jupiter</artifactId>
<version>3.3.3</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>70.1</version>
</dependency>
</dependencies>
</dependencyManagement>