75 changed files with 2531 additions and 9062 deletions
--- a/.gitignore
+++ b/.gitignore
@ -19,5 +19,3 @@
 /build
 spark-warehouse
 /dhp-workflows/dhp-graph-mapper/job-override.properties
-test.properties
-
--- a/dhp-build/dhp-build-assembly-resources/pom.xml
+++ b/dhp-build/dhp-build-assembly-resources/pom.xml
@ -6,7 +6,7 @@
    <parent>
        <groupId>eu.dnetlib</groupId>
        <artifactId>dhp-build</artifactId>
-        <version>4.1.13-SNAPSHOT</version>
+        <version>4.1.9</version>
    </parent>

    <artifactId>dhp-build-assembly-resources</artifactId>
--- a/dhp-build/dhp-build-properties-maven-plugin/pom.xml
+++ b/dhp-build/dhp-build-properties-maven-plugin/pom.xml
@ -6,11 +6,10 @@
    <parent>
        <groupId>eu.dnetlib</groupId>
        <artifactId>dhp-build</artifactId>
-        <version>4.1.13-SNAPSHOT</version>
+        <version>4.1.9</version>
    </parent>

    <artifactId>dhp-build-properties-maven-plugin</artifactId>
-    <version>4.1.13-SNAPSHOT</version>
    <packaging>maven-plugin</packaging>

    <description>This module is a maven plugin implementing custom properties substitutions in the build lifecycle</description>
@ -20,19 +19,16 @@
            <groupId>org.apache.maven</groupId>
            <artifactId>maven-plugin-api</artifactId>
            <version>3.6.3</version>
-            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.maven</groupId>
            <artifactId>maven-project</artifactId>
            <version>2.2.1</version>
-            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.maven</groupId>
            <artifactId>maven-artifact</artifactId>
            <version>2.2.1</version>
-            <scope>provided</scope>
        </dependency>

        <dependency>
@ -104,29 +100,6 @@
                </configuration>
            </plugin>
        </plugins>
-
-        <pluginManagement>
-            <plugins>
-                <plugin>
-                    <groupId>org.apache.maven.plugins</groupId>
-                    <artifactId>maven-plugin-plugin</artifactId>
-                    <version>3.2</version>
-                    <configuration>
-                        <skipErrorNoDescriptorsFound>true</skipErrorNoDescriptorsFound>
-                    </configuration>
-                    <executions>
-                        <execution>
-                            <id>mojo-descriptor</id>
-                            <phase>process-classes</phase>
-                            <goals>
-                                <goal>descriptor</goal>
-                            </goals>
-                        </execution>
-                    </executions>
-                </plugin>
-            </plugins>
-        </pluginManagement>
-
    </build>

 </project>
--- a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java
+++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java
@ -8,8 +8,6 @@ import static org.junit.jupiter.api.Assertions.assertNull;

 import org.junit.jupiter.api.*;

-import java.nio.file.Paths;
-
 /** @author mhorst, claudio.atzori */
 public class GenerateOoziePropertiesMojoTest {

@ -68,7 +66,7 @@ public class GenerateOoziePropertiesMojoTest {
 		clearSystemProperties();

 		// given
-		String workflowSourceDir = Paths.get("eu/dnetlib/dhp/").toString();
+		String workflowSourceDir = "eu/dnetlib/dhp/";
 		System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);

 		// execute
@ -83,14 +81,14 @@ public class GenerateOoziePropertiesMojoTest {

 		clearSystemProperties();
 		// given
-		String workflowSourceDir = Paths.get("eu/dnetlib/dhp/wf/transformers").toString();
+		String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers";
 		System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);

 		// execute
 		mojo.execute();

 		// assert
-		assertEquals(Paths.get("wf/transformers").toString(), System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
+		assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
 	}

 	@Test
@ -98,13 +96,13 @@ public class GenerateOoziePropertiesMojoTest {

 		clearSystemProperties();
 		// given
-		String workflowSourceDir = Paths.get("wf/transformers").toString();
+		String workflowSourceDir = "wf/transformers";
 		System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);

 		// execute
 		mojo.execute();

 		// assert
-		assertEquals(Paths.get("wf/transformers").toString(), System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
+		assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
 	}
 }
--- a/dhp-build/dhp-build-properties-maven-plugin/test.properties
+++ b/dhp-build/dhp-build-properties-maven-plugin/test.properties
@ -0,0 +1 @@
+# Thu Dec 30 13:11:51 CET 2021
--- a/dhp-build/dhp-code-style/pom.xml
+++ b/dhp-build/dhp-code-style/pom.xml
@ -5,7 +5,7 @@

    <groupId>eu.dnetlib</groupId>
    <artifactId>dhp-code-style</artifactId>
-    <version>4.1.13-SNAPSHOT</version>
+    <version>4.1.9</version>

    <packaging>jar</packaging>

--- a/dhp-build/pom.xml
+++ b/dhp-build/pom.xml
@ -4,7 +4,7 @@
 	<parent>
 		<groupId>eu.dnetlib</groupId>
 		<artifactId>dnet-dedup</artifactId>
-		<version>4.1.13-SNAPSHOT</version>
+		<version>4.1.9</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 	<artifactId>dhp-build</artifactId>
--- a/dnet-dedup-test/job-override.properties
+++ b/dnet-dedup-test/job-override.properties
@ -1,6 +1,6 @@
-useTree = true
-entitiesPath = /user/michele.debonis/lda_experiments/authors_pubmed
-workingPath = /user/michele.debonis/authors_dedup/gt2_dedup
-numPartitions = 1000
-dedupConfPath = /user/michele.debonis/lda_experiments/authors.fdup.gt2.conf.json
-groundTruthFieldJPath = $.orcid
+entitiesPath = /tmp/publications_test_dump
+#entitiesPath = /tmp/prod_provision/graph/02_graph_cleaned/publication
+workingPath = /user/michele.debonis/new_dedup_test/workingdirtree
+dedupConfPath = /user/michele.debonis/new_dedup_test/pubs.tree.conf.json
+numPartitions = 8000
+useTree = true
--- a/dnet-dedup-test/pom.xml
+++ b/dnet-dedup-test/pom.xml
@ -6,7 +6,7 @@
    <parent>
        <groupId>eu.dnetlib</groupId>
        <artifactId>dnet-dedup</artifactId>
-        <version>4.1.13-SNAPSHOT</version>
+        <version>4.1.9</version>
        <relativePath>../pom.xml</relativePath>
    </parent>

--- a/dnet-dedup-test/src/main/java/eu/dnetlib/Deduper.java
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/Deduper.java
@ -1,7 +1,7 @@
 package eu.dnetlib;

 import com.google.common.hash.Hashing;
-import eu.dnetlib.graph.JavaGraphProcessor;
+import eu.dnetlib.graph.GraphProcessor;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.model.MapDocument;
 import eu.dnetlib.pace.util.BlockProcessorForTesting;
@ -19,6 +19,7 @@ import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.graphx.Edge;
+import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
@ -57,13 +58,14 @@ public class Deduper implements Serializable {
                .reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize));
    }

-    public static Iterator<Tuple2<String, String>> ccToMergeRel(Tuple2<String, List<String>> cc, DedupConfig dedupConf) {
-        return cc._2()
+    public static Iterator<Tuple2<String, String>> ccToMergeRel(ConnectedComponent cc, DedupConfig dedupConf) {
+        return cc
+                .getDocs()
                .stream()
                .flatMap(
                        id -> {
                            List<Tuple2<String, String>> tmp = new ArrayList<>();
-                            tmp.add(new Tuple2<>(cc._1(), id));
+                            tmp.add(new Tuple2<>(cc.getCcId(), id));
                            return tmp.stream();
                        })
                .iterator();
@ -136,19 +138,21 @@ public class Deduper implements Serializable {
                .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
                .mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));

-        final JavaRDD<Edge<String>> edgeRdd = spark
+        final RDD<Edge<String>> edgeRdd = spark
                .read()
                .load(simRelsPath)
                .as(Encoders.bean(Relation.class))
                .javaRDD()
-                .map(Relation::toEdgeRdd);
+                .map(Relation::toEdgeRdd)
+                .rdd();

-        JavaPairRDD<String, List<String>> ccs = JavaGraphProcessor
-                .findCCs(vertexes, edgeRdd, dedupConf.getWf().getMaxIterations());
+        JavaRDD<ConnectedComponent> ccs = GraphProcessor
+                .findCCs(vertexes.rdd(), edgeRdd, maxIterations)
+                .toJavaRDD();

        JavaRDD<Relation> mergeRel = ccs
-                .filter(cc -> cc._2().size() > 1)
-                .flatMap(cc -> Deduper.ccToMergeRel(cc, dedupConf))
+                .filter(k -> k.getDocs().size() > 1)
+                .flatMap(cc -> ccToMergeRel(cc, dedupConf))
                .map(it -> new Relation(it._1(), it._2(), "mergeRel"));

        final Dataset<Relation> mergeRels = spark
@ -159,7 +163,7 @@ public class Deduper implements Serializable {
        mergeRels.write().mode(SaveMode.Overwrite).parquet(mergeRelsPath);
    }

-    public static void createDedupEntity(DedupConfig dedupConf, String simRelsPath, String mergeRelsPath, String entitiesPath, SparkSession spark, String dedupEntityPath){
+    public static void createDedupEntity(DedupConfig dedupConf, String mergeRelsPath, String entitiesPath, SparkSession spark, String dedupEntityPath){

        JavaPairRDD<String, String> entities = spark
                .read()
@ -170,15 +174,7 @@ public class Deduper implements Serializable {
                .toJavaRDD()
                .mapToPair(t -> t);

-        // <source_raw_id, relation(source, target)>
-        JavaPairRDD<String, Relation> simRels = spark
-                .read()
-                .load(simRelsPath)
-                .as(Encoders.bean(Relation.class))
-                .toJavaRDD()
-                .mapToPair(r-> new Tuple2<>(r.getSource(), r));
-
-        // <raw_id, relation(dedup_id, raw_id)>
+        // <source, target>: source is the dedup_id, target is the id of the mergedIn
        JavaPairRDD<String, Relation> mergeRels = spark
                .read()
                .load(mergeRelsPath)
@ -191,22 +187,7 @@ public class Deduper implements Serializable {
                .groupByKey()
                .map(t-> entityMerger(t._1(), t._2().iterator()));

-        JavaPairRDD<String, Iterable<Relation>> simRelsWithDedupId = simRels
-                .join(mergeRels)
-                .mapToPair(x -> new Tuple2<>(x._2()._2().getSource(), x._2()._1()))
-                .groupByKey();
-
-        JavaRDD<ConnectedComponent> groupEntity = mergeRels.join(entities)
-                .mapToPair(t -> new Tuple2<>(t._2()._1().getSource(), t._2()._2()))
-                .groupByKey()
-                .join(simRelsWithDedupId)
-                .map(x -> new ConnectedComponent(
-                        x._1(),
-                        x._2()._1(),
-                        x._2()._2())
-                );
-
-        groupEntity.saveAsTextFile(dedupEntityPath);
+        dedupEntities.saveAsTextFile(dedupEntityPath);
    }

 }
--- a/dnet-dedup-test/src/main/java/eu/dnetlib/graph/JavaGraphProcessor.java
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/graph/JavaGraphProcessor.java
@ -1,56 +0,0 @@
-package eu.dnetlib.graph;
-
-import com.clearspring.analytics.util.Lists;
-import com.google.common.collect.Sets;
-import eu.dnetlib.pace.utils.Utility;
-import eu.dnetlib.support.ConnectedComponent;
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.graphx.*;
-import org.apache.spark.rdd.RDD;
-import org.apache.spark.storage.StorageLevel;
-import scala.Tuple2;
-import scala.reflect.ClassTag;
-import scala.reflect.ClassTag$;
-
-import java.util.List;
-
-public class JavaGraphProcessor {
-
-    //<ccId, list(json)>
-    public static JavaPairRDD<String, List<String>> findCCs(JavaPairRDD<Object, String> vertexes, JavaRDD<Edge<String>> edges, int maxIterations) {
-
-        ClassTag<String> stringTag = ClassTag$.MODULE$.apply(String.class);
-        Graph<String, String> graph =
-                Graph.apply(
-                        vertexes.rdd(),
-                        edges.rdd(),
-                        "",
-                        StorageLevel.MEMORY_ONLY(),
-                        StorageLevel.MEMORY_ONLY(),
-                        stringTag,
-                        stringTag
-                );
-
-        GraphOps<String, String> graphOps = new GraphOps<>(graph, stringTag, stringTag);
-        JavaRDD<Tuple2<Object, Object>> cc = graphOps.connectedComponents(maxIterations).vertices().toJavaRDD();
-
-        JavaPairRDD<Object, String> joinResult = vertexes
-                .leftOuterJoin(cc.mapToPair(x -> x))
-                .mapToPair(x -> {
-                    if (!x._2()._2().isPresent()) {
-                        return new Tuple2<>(x._1(), x._2()._1());
-                    } else {
-                        return new Tuple2<>(x._2()._2(), x._2()._1());
-                    }
-                });
-
-        return joinResult
-                .groupByKey()
-                .map(x -> Lists.newArrayList(x._2()))
-                .zipWithUniqueId()
-                .mapToPair(x -> new Tuple2<>("dedup______::" + x._2().toString(), x._1()));
-
-    }
-
-}
--- a/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/AbstractSparkJob.java
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/AbstractSparkJob.java
@ -19,7 +19,6 @@ import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.Serializable;
-import java.nio.charset.StandardCharsets;
 import java.util.stream.Collectors;

 public abstract class AbstractSparkJob implements Serializable {
@ -60,7 +59,7 @@ public abstract class AbstractSparkJob implements Serializable {

        Path path=new Path(filePath);
        FileSystem fs = FileSystem.get(new Configuration());
-        BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(path), StandardCharsets.UTF_8));
+        BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(path)));
        try {
            return String.join("", br.lines().collect(Collectors.toList()));
        } finally {
--- a/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkComputeStatistics.java
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkComputeStatistics.java
@ -1,36 +1,20 @@
 package eu.dnetlib.jobs;

-import eu.dnetlib.Deduper;
-import eu.dnetlib.pace.config.DedupConfig;
-import eu.dnetlib.pace.config.Type;
-import eu.dnetlib.pace.model.FieldValueImpl;
-import eu.dnetlib.pace.model.MapDocument;
-import eu.dnetlib.pace.util.MapDocumentUtil;
-import eu.dnetlib.pace.utils.Utility;
 import eu.dnetlib.support.ArgumentApplicationParser;
-import eu.dnetlib.support.Block;
-import eu.dnetlib.support.ConnectedComponent;
 import eu.dnetlib.support.Relation;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
-import org.codehaus.jackson.map.ObjectMapper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import scala.Tuple2;

 import java.io.IOException;
-import java.util.List;
 import java.util.Optional;
-import java.util.stream.Collectors;

 public class SparkComputeStatistics extends AbstractSparkJob {

@ -58,42 +42,18 @@ public class SparkComputeStatistics extends AbstractSparkJob {

        @Override
        public void run() throws IOException {
-            //https://towardsdatascience.com/7-evaluation-metrics-for-clustering-algorithms-bdc537ff54d2#:~:text=There%20are%20two%20types%20of,to%20all%20unsupervised%20learning%20results)
+
            // read oozie parameters
            final String entitiesPath = parser.get("entitiesPath");
            final String workingPath = parser.get("workingPath");
-            final String dedupConfPath = parser.get("dedupConfPath");
-            final String groundTruthFieldJPath = parser.get("groundTruthFieldJPath");
            final int numPartitions = Optional
                    .ofNullable(parser.get("numPartitions"))
                    .map(Integer::valueOf)
                    .orElse(NUM_PARTITIONS);

-            log.info("entitiesPath:          '{}'", entitiesPath);
-            log.info("workingPath:           '{}'", workingPath);
-            log.info("numPartitions:         '{}'", numPartitions);
-            log.info("dedupConfPath:         '{}'", dedupConfPath);
-            log.info("groundTruthFieldJPath: '{}'", groundTruthFieldJPath);
-
-            JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
-            DedupConfig dedupConfig = loadDedupConfig(dedupConfPath);
-
-            JavaPairRDD<String, MapDocument> mapDocuments = sc
-                    .textFile(entitiesPath)
-                    .repartition(numPartitions)
-                    .mapToPair(
-                            (PairFunction<String, String, MapDocument>) s -> {
-                                MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConfig, s);
-                                //put in the map the groundTruthField used to compute statistics
-                                d.getFieldMap().put("groundTruth", new FieldValueImpl(Type.String, "groundTruth", MapDocumentUtil.getJPathString(groundTruthFieldJPath, s)));
-                                return new Tuple2<>(d.getIdentifier(), d);
-                            });
-
-            JavaRDD<String> entities = mapDocuments.map(d -> d._2().getFieldMap().get("groundTruth").stringValue());
-
-            // create blocks
-            JavaRDD<List<String>> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConfig)
-                    .map(b -> b._2().getDocuments().stream().map(d -> d.getFieldMap().get("groundTruth").stringValue()).collect(Collectors.toList()));
+            log.info("entitiesPath:  '{}'", entitiesPath);
+            log.info("workingPath:   '{}'", workingPath);
+            log.info("numPartitions: '{}'", numPartitions);

            // <source, target>: source is the dedup_id, target is the id of the mergedIn
            JavaRDD<Relation> mergerels = spark
@ -108,38 +68,15 @@ public class SparkComputeStatistics extends AbstractSparkJob {
                    .as(Encoders.bean(Relation.class))
                    .toJavaRDD();

-            JavaRDD<List<String>> groups = sc.textFile(workingPath + "/groupentities")
-                    .map(e -> new ObjectMapper().readValue(e, ConnectedComponent.class))
-                    .map(e -> e.getDocs().stream().map(d -> MapDocumentUtil.getJPathString(groundTruthFieldJPath, d)).collect(Collectors.toList()));
-
-            long entities_number = entities.count();
-            long blocks_number = blocks.count();
-            double blocks_randIndex = randIndex(blocks);
            long simrels_number = simrels.count();
            long mergerels_number = mergerels.count();
-            double groups_randIndex = randIndex(groups);
-            long groups_number = groups.count();
-            long groundtruth_number = entities.filter(e -> !e.isEmpty()).count();
-            long correct_groups = groups.filter(x -> x.stream().distinct().count()==1).count();
-            long wrong_groups = groups_number - correct_groups;
+            long connected_components = mergerels.groupBy(Relation::getSource).count();

-            String print =
-                    "Entities : " + entities_number + "\n" +
-                    "Ground Truth : " + groundtruth_number + "\n" +
-                    "Blocks : " + blocks_number + "\n" +
-                    "Blocks RI : " + blocks_randIndex + "\n" +
-                    "SimRels : " + simrels_number + "\n" +
-                    "MergeRels : " + mergerels_number + "\n" +
-                    "Groups : " + groups_number + " (correct: " + correct_groups + ", wrong: " + wrong_groups + ")\n" +
-                    "Groups RI : " + groups_randIndex;
-
-            System.out.println(print);
-
-            writeStatsFileToHDFS(groundtruth_number, entities_number, blocks_randIndex, groups_randIndex, blocks_number, simrels_number, mergerels_number, groups_number, workingPath + "/stats_file.txt");
+            writeStatsFileToHDFS(simrels_number, mergerels_number, connected_components, workingPath + "/stats_file");

        }

-        public static void writeStatsFileToHDFS(long groundtruth_number, long entities_number, double blocks_randIndex, double groups_randIndex, long blocks_number, long simrels_number, long mergerels_number, long groups_number, String filePath) throws IOException {
+        public static void writeStatsFileToHDFS(long simrels_number, long mergerels_number, long connected_components, String filePath) throws IOException {
            Configuration conf = new Configuration();

            FileSystem fs = FileSystem.get(conf);
@ -156,14 +93,9 @@ public class SparkComputeStatistics extends AbstractSparkJob {
                }

                String print =
-                        "Entities : " + entities_number + "\n" +
-                        "Ground Truth : " + groundtruth_number + "\n" +
-                        "Blocks : " + blocks_number + "\n" +
-                        "Blocks RI : " + blocks_randIndex + "\n" +
-                        "SimRels : " + simrels_number + "\n" +
-                        "MergeRels : " + mergerels_number + "\n" +
-                        "Groups : " + groups_number + "\n" +
-                        "Groups RI : " + groups_randIndex;
+                        "Similarity Relations : " + simrels_number + "\n" +
+                        "Merge Relations : " + mergerels_number + "\n" +
+                        "Connected Components : " + connected_components;

                // Create file to write
                FSDataOutputStream out = fs.create(outFile);
@ -177,31 +109,5 @@ public class SparkComputeStatistics extends AbstractSparkJob {
                e.printStackTrace();
            }
        }
-
-        //TODO find another maesure that takes into account all the elements outside of the group too
-        //RandIndex = number of pairwise correct predictions/total number of possible pairs (in the same cluster) -> bounded between 0 and 1
-        public double randIndex(JavaRDD<List<String>> clusters) {
-
-            Tuple2<Integer, Integer> reduce = clusters.map(c -> {
-                        int num = 0;
-                        for (String id : c.stream().distinct().filter(s -> !s.isEmpty()).collect(Collectors.toList())) {
-                            int n = (int) c.stream().filter(i -> i.equals(id)).count();
-                            num += binomialCoefficient(n);
-                        }
-                        int den = binomialCoefficient(c.size());
-                        return new Tuple2<>(num, den);
-                    })
-                    .reduce((a, b) -> new Tuple2<>(a._1() + b._1(), a._2() + b._2()));
-
-            return (double)reduce._1()/ reduce._2();
-        }
-
-        private static int binomialCoefficient(int n)
-        {
-            return n*(n-1)/2;
-        }
-
-        //V-measure = harmonic mean of homogeneity and completeness, homogeneity = each cluster contains only members of a single class, completeness = all members of a given class are assigned to the same cluster
-
 }

--- a/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateDedupEntity.java
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateDedupEntity.java
@ -7,7 +7,6 @@ import eu.dnetlib.pace.utils.Utility;
 import eu.dnetlib.support.ArgumentApplicationParser;
 import eu.dnetlib.support.ConnectedComponent;
 import eu.dnetlib.support.Relation;
-import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
@ -17,32 +16,29 @@ import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import scala.Tuple2;
-import scala.Tuple3;

 import java.io.IOException;
-import java.util.*;
-import java.util.stream.Collectors;
-import java.util.stream.StreamSupport;
+import java.util.Optional;

-public class SparkCreateGroupEntity extends AbstractSparkJob {
+public class SparkCreateDedupEntity extends AbstractSparkJob {

-        private static final Logger log = LoggerFactory.getLogger(SparkCreateGroupEntity.class);
+        private static final Logger log = LoggerFactory.getLogger(eu.dnetlib.jobs.SparkCreateDedupEntity.class);

-        public SparkCreateGroupEntity(ArgumentApplicationParser parser, SparkSession spark) {
+        public SparkCreateDedupEntity(ArgumentApplicationParser parser, SparkSession spark) {
            super(parser, spark);
        }

        public static void main(String[] args) throws Exception {

            ArgumentApplicationParser parser = new ArgumentApplicationParser(
-                    Utility.readResource("/jobs/parameters/createGroupEntity_parameters.json", SparkCreateGroupEntity.class)
+                    Utility.readResource("/jobs/parameters/createDedupEntity_parameters.json", SparkCreateDedupEntity.class)
            );

            parser.parseArgument(args);

            SparkConf conf = new SparkConf();

-            new SparkCreateGroupEntity(
+            new SparkCreateDedupEntity(
                    parser,
                    getSparkSession(conf)
            ).run();
@ -67,7 +63,6 @@ public class SparkCreateGroupEntity extends AbstractSparkJob {

            DedupConfig dedupConf = DedupConfig.load(readFileFromHDFS(dedupConfPath));

-            // <raw_id, json>
            JavaPairRDD<String, String> entities = spark
                    .read()
                    .textFile(entitiesPath)
@ -77,15 +72,7 @@ public class SparkCreateGroupEntity extends AbstractSparkJob {
                    .toJavaRDD()
                    .mapToPair(t -> t);

-            // <source_raw_id, relation(source, target)>
-            JavaPairRDD<String, Relation> simRels = spark
-                    .read()
-                    .load(workingPath + "/simrels")
-                    .as(Encoders.bean(Relation.class))
-                    .toJavaRDD()
-                    .mapToPair(r-> new Tuple2<>(r.getSource(), r));
-
-            // <raw_id, relation(dedup_id, raw_id)>
+            // <source, target>: source is the dedup_id, target is the id of the mergedIn
            JavaPairRDD<String, Relation> mergeRels = spark
                    .read()
                    .load(workingPath + "/mergerels")
@ -93,23 +80,12 @@ public class SparkCreateGroupEntity extends AbstractSparkJob {
                    .toJavaRDD()
                    .mapToPair(r -> new Tuple2<>(r.getTarget(), r));

-            // <dedup_id, simrel>
-            JavaPairRDD<String, Iterable<Relation>> simRelsWithDedupId = simRels
-                    .join(mergeRels)
-                    .mapToPair(x -> new Tuple2<>(x._2()._2().getSource(), x._2()._1()))
-                    .groupByKey();
-
-            JavaRDD<ConnectedComponent> groupEntity = mergeRels.join(entities)
+            JavaRDD<ConnectedComponent> dedupEntities = mergeRels.join(entities)
                    .mapToPair(t -> new Tuple2<>(t._2()._1().getSource(), t._2()._2()))
                    .groupByKey()
-                    .join(simRelsWithDedupId)
-                    .map(x -> new ConnectedComponent(
-                            x._1(),
-                            x._2()._1(),
-                            x._2()._2())
-                    );
+                    .map(t-> Deduper.entityMerger(t._1(), t._2().iterator()));

-            groupEntity.saveAsTextFile(workingPath + "/groupentities", GzipCodec.class);
+            dedupEntities.saveAsTextFile(workingPath + "dedupentity");

        }

--- a/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateMergeRels.java
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateMergeRels.java
@ -1,7 +1,7 @@
 package eu.dnetlib.jobs;

 import eu.dnetlib.Deduper;
-import eu.dnetlib.graph.JavaGraphProcessor;
+import eu.dnetlib.graph.GraphProcessor;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.util.MapDocumentUtil;
 import eu.dnetlib.pace.utils.Utility;
@ -24,7 +24,6 @@ import org.slf4j.LoggerFactory;
 import scala.Tuple2;

 import java.io.IOException;
-import java.util.List;
 import java.util.Optional;

 import static eu.dnetlib.Deduper.hash;
@ -79,18 +78,20 @@ public class SparkCreateMergeRels extends AbstractSparkJob {
                .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
                .mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));

-        final JavaRDD<Edge<String>> edgeRdd = spark
+        final RDD<Edge<String>> edgeRdd = spark
                .read()
                .load(workingPath + "/simrels")
                .as(Encoders.bean(Relation.class))
                .javaRDD()
-                .map(Relation::toEdgeRdd);
+                .map(Relation::toEdgeRdd)
+                .rdd();

-        JavaPairRDD<String, List<String>> ccs = JavaGraphProcessor
-                .findCCs(vertexes, edgeRdd, dedupConf.getWf().getMaxIterations());
+        JavaRDD<ConnectedComponent> ccs = GraphProcessor
+                .findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations())
+                .toJavaRDD();

        JavaRDD<Relation> mergeRel = ccs
-                .filter(cc -> cc._2().size() > 1)
+                .filter(k -> k.getDocs().size() > 1)
                .flatMap(cc -> Deduper.ccToMergeRel(cc, dedupConf))
                .map(it -> new Relation(it._1(), it._2(), "mergeRel"));

--- a/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateSimRels.java
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/jobs/SparkCreateSimRels.java
@ -14,7 +14,6 @@ import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.PairFunction;
-import org.apache.spark.sql.Encoder;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
--- a/dnet-dedup-test/src/main/java/eu/dnetlib/support/Block.java
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/support/Block.java
@ -1,7 +1,10 @@
 package eu.dnetlib.support;

 import java.io.Serializable;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import java.util.stream.StreamSupport;
@ -9,7 +12,6 @@ import java.util.stream.StreamSupport;
 import com.google.common.collect.Lists;

 import eu.dnetlib.pace.model.MapDocument;
-import org.codehaus.jackson.annotate.JsonIgnore;

 public class Block implements Serializable {

@ -21,11 +23,6 @@ public class Block implements Serializable {
        super();
    }

-    public Block(String key, List<MapDocument> documents) {
-        this.key = key;
-        this.documents = documents;
-    }
-
    public Block(String key, Iterable<MapDocument> documents) {
        this.key = key;
        this.documents = Lists.newArrayList(documents);
--- a/dnet-dedup-test/src/main/java/eu/dnetlib/support/ConnectedComponent.java
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/support/ConnectedComponent.java
@ -5,35 +5,54 @@ import java.io.Serializable;
 import java.util.HashSet;
 import java.util.Set;

-import com.google.common.collect.Sets;
+import eu.dnetlib.pace.utils.Utility;
+import org.apache.commons.lang.StringUtils;
+import org.codehaus.jackson.annotate.JsonIgnore;
+
+import com.fasterxml.jackson.databind.ObjectMapper;

 import eu.dnetlib.pace.util.PaceException;
-import org.codehaus.jackson.map.ObjectMapper;

 public class ConnectedComponent implements Serializable {

    private HashSet<String> docs;
    private String ccId;
-    private HashSet<Relation> simrels;

    public ConnectedComponent() {
    }

-    public ConnectedComponent(String ccId, Set<String> docs, Set<Relation> simrels) {
-        this.docs = new HashSet<>(docs);
-        this.ccId = ccId;
-        this.simrels = new HashSet<>(simrels);
-    }
-
    public ConnectedComponent(Set<String> docs) {
        this.docs = new HashSet<>(docs);
-        //initialization of id and relations missing
+        createID();
    }

-    public ConnectedComponent(String ccId, Iterable<String> docs, Iterable<Relation> simrels) {
-        this.ccId = ccId;
-        this.docs = Sets.newHashSet(docs);
-        this.simrels = Sets.newHashSet(simrels);
+    public String createID() {
+        if (docs.size() > 1) {
+            final String s = getMin();
+            ccId = "dedup::" + Utility.md5(s);
+            return ccId;
+        } else {
+            return docs.iterator().next();
+        }
+    }
+
+    @JsonIgnore
+    public String getMin() {
+
+        final StringBuilder min = new StringBuilder();
+        docs
+                .forEach(
+                        i -> {
+                            if (StringUtils.isBlank(min.toString())) {
+                                min.append(i);
+                            } else {
+                                if (min.toString().compareTo(i) > 0) {
+                                    min.setLength(0);
+                                    min.append(i);
+                                }
+                            }
+                        });
+        return min.toString();
    }

    @Override
@ -61,12 +80,4 @@ public class ConnectedComponent implements Serializable {
    public void setCcId(String ccId) {
        this.ccId = ccId;
    }
-
-    public void setSimrels(HashSet<Relation> simrels) {
-        this.simrels = simrels;
-    }
-
-    public HashSet<Relation> getSimrels() {
-        return simrels;
-    }
 }
--- a/dnet-dedup-test/src/main/resources/dedup/oozie_app/workflow.xml
+++ b/dnet-dedup-test/src/main/resources/dedup/oozie_app/workflow.xml
@ -16,10 +16,6 @@
            <name>dedupConfPath</name>
            <description>path for the dedup configuration file</description>
        </property>
-        <property>
-            <name>groundTruthFieldJPath</name>
-            <description>jpath of the field to be used as ground truth</description>
-        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
@ -142,33 +138,6 @@
            <arg>--numPartitions</arg><arg>${numPartitions}</arg>
            <arg>--dedupConfPath</arg><arg>${dedupConfPath}</arg>
        </spark>
-        <ok to="CreateGroupEntities"/>
-        <error to="Kill"/>
-    </action>
-
-
-    <action name="CreateGroupEntities">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>Create Group Entities</name>
-            <class>eu.dnetlib.jobs.SparkCreateGroupEntity</class>
-            <jar>dnet-dedup-test-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.shuffle.partitions=3840
-            </spark-opts>
-            <arg>--entitiesPath</arg><arg>${entitiesPath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
-            <arg>--numPartitions</arg><arg>${numPartitions}</arg>
-            <arg>--dedupConfPath</arg><arg>${dedupConfPath}</arg>
-        </spark>
        <ok to="ComputeStatistics"/>
        <error to="Kill"/>
    </action>
@ -193,12 +162,36 @@
            <arg>--entitiesPath</arg><arg>${entitiesPath}</arg>
            <arg>--workingPath</arg><arg>${workingPath}</arg>
            <arg>--numPartitions</arg><arg>${numPartitions}</arg>
-            <arg>--dedupConfPath</arg><arg>${dedupConfPath}</arg>
-            <arg>--groundTruthFieldJPath</arg><arg>${groundTruthFieldJPath}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

+    <!--<action name="CreateDedupEntities">-->
+        <!--<spark xmlns="uri:oozie:spark-action:0.2">-->
+            <!--<master>yarn</master>-->
+            <!--<mode>cluster</mode>-->
+            <!--<name>Create Dedup Entities</name>-->
+            <!--<class>eu.dnetlib.jobs.SparkCreateDedupEntity</class>-->
+            <!--<jar>dnet-dedup-test-${projectVersion}.jar</jar>-->
+            <!--<spark-opts>-->
+                <!--&#45;&#45;executor-memory=${sparkExecutorMemory}-->
+                <!--&#45;&#45;executor-cores=${sparkExecutorCores}-->
+                <!--&#45;&#45;driver-memory=${sparkDriverMemory}-->
+                <!--&#45;&#45;conf spark.extraListeners=${spark2ExtraListeners}-->
+                <!--&#45;&#45;conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}-->
+                <!--&#45;&#45;conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}-->
+                <!--&#45;&#45;conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}-->
+                <!--&#45;&#45;conf spark.sql.shuffle.partitions=3840-->
+            <!--</spark-opts>-->
+            <!--<arg>&#45;&#45;entitiesPath</arg><arg>${entitiesPath}</arg>-->
+            <!--<arg>&#45;&#45;workingPath</arg><arg>${workingPath}</arg>-->
+            <!--<arg>&#45;&#45;numPartitions</arg><arg>${numPartitions}</arg>-->
+            <!--<arg>&#45;&#45;dedupConfPath</arg><arg>${dedupConfPath}</arg>-->
+        <!--</spark>-->
+        <!--<ok to="End"/>-->
+        <!--<error to="Kill"/>-->
+    <!--</action>-->
+
    <end name="End"/>
 </workflow-app>
--- a/dnet-dedup-test/src/main/resources/jobs/parameters/computeStatistics_parameters.json
+++ b/dnet-dedup-test/src/main/resources/jobs/parameters/computeStatistics_parameters.json
@ -16,17 +16,5 @@
    "paramLongName": "numPartitions",
    "paramDescription": "number of partitions for the similarity relations intermediate phases",
    "paramRequired": false
-  },
-  {
-    "paramName": "dc",
-    "paramLongName": "dedupConfPath",
-    "paramDescription": "dedup configuration to be used",
-    "paramRequired": true
-  },
-  {
-    "paramName": "gt",
-    "paramLongName": "groundTruthFieldJPath",
-    "paramDescription": "field to be used as groundtruth",
-    "paramRequired": true
  }
 ]
--- a/dnet-dedup-test/src/main/resources/jobs/parameters/createDedupEntity_parameters.json
+++ b/dnet-dedup-test/src/main/resources/jobs/parameters/createDedupEntity_parameters.json
--- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java
+++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/authors.fdup.conf.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/authors.fdup.conf.json
@ -1,134 +0,0 @@
-{
-  "wf": {
-    "threshold": "0.99",
-    "dedupRun": "001",
-    "entityType": "author",
-    "subEntityType": "author",
-    "subEntityValue": "author",
-    "orderField": "fullname",
-    "queueMaxSize": "200",
-    "groupMaxSize": "100",
-    "maxChildren": "100",
-    "slidingWindowSize": "50",
-    "rootBuilder": [
-      "result",
-      "resultProject_outcome_isProducedBy",
-      "resultResult_publicationDataset_isRelatedTo",
-      "resultResult_similarity_isAmongTopNSimilarDocuments",
-      "resultResult_similarity_hasAmongTopNSimilarDocuments",
-      "resultOrganization_affiliation_isAffiliatedWith",
-      "resultResult_part_hasPart",
-      "resultResult_part_isPartOf",
-      "resultResult_supplement_isSupplementTo",
-      "resultResult_supplement_isSupplementedBy",
-      "resultResult_version_isVersionOf"
-    ],
-    "includeChildren": "true",
-    "maxIterations": 20,
-    "idPath": "$.id"
-  },
-  "pace": {
-    "clustering" : [
-      { "name" : "lnfi", "fields" : [ "fullname" ], "params" : {} }
-    ],
-    "decisionTree": {
-      "start": {
-        "fields": [
-          {
-            "field": "orcid",
-            "comparator": "exactMatch",
-            "weight": 1.0,
-            "countIfUndefined": "true",
-            "params": {}
-          }
-        ],
-        "threshold": 1.0,
-        "aggregation": "MAX",
-        "positive": "MATCH",
-        "negative": "NO_MATCH",
-        "undefined": "orcids",
-        "ignoreUndefined": "true"
-      },
-      "orcids": {
-        "fields": [
-          {
-            "field": "orcids",
-            "comparator": "stringListMatch",
-            "weight": 1.0,
-            "countIfUndefined": "true",
-            "params": {"type": "count"}
-          }
-        ],
-        "threshold": 3.0,
-        "aggregation": "MAX",
-        "positive": "MATCH",
-        "negative": "coauthors",
-        "undefined": "coauthors",
-        "ignoreUndefined": "true"
-      },
-      "coauthors": {
-        "fields": [
-          {
-            "field": "coauthors",
-            "comparator": "authorsMatch",
-            "weight": 1.0,
-            "countIfUndefined": "true",
-            "params": {"type": "count"}
-          }
-        ],
-        "threshold": 1.0,
-        "aggregation": "MAX",
-        "positive": "topicsMatch",
-        "negative": "NO_MATCH",
-        "undefined": "topicsMatch",
-        "ignoreUndefined": "true"
-      },
-      "topicsMatch": {
-        "fields": [
-          {
-            "field": "topics",
-            "comparator": "cosineSimilarity",
-            "weight": 1.0,
-            "countIfUndefined": "true",
-            "params": {}
-          }
-        ],
-        "threshold": 0.5,
-        "aggregation": "MAX",
-        "positive": "MATCH",
-        "negative": "NO_MATCH",
-        "undefined": "NO_MATCH",
-        "ignoreUndefined": "false"
-      }
-    },
-    "model": [
-      {
-        "name": "topics",
-        "type": "DoubleArray",
-        "path": "$.topics"
-      },
-      {
-        "name": "fullname",
-        "type": "String",
-        "path": "$.fullname"
-      },
-      {
-        "name": "orcid",
-        "type": "String",
-        "path": "$.orcid"
-      },
-      {
-        "name": "coauthors",
-        "type": "List",
-        "path": "$.coAuthors[*].fullname"
-      },
-      {
-        "name": "orcids",
-        "type": "List",
-        "path": "$.coAuthors[*].orcid"
-      }
-    ],
-    "blacklists": {},
-    "synonyms": {}
-  }
-}
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/authors.fdup.soft.conf.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/authors.fdup.soft.conf.json
@ -1,134 +0,0 @@
-{
-  "wf": {
-    "threshold": "0.99",
-    "dedupRun": "001",
-    "entityType": "author",
-    "subEntityType": "author",
-    "subEntityValue": "author",
-    "orderField": "fullname",
-    "queueMaxSize": "200",
-    "groupMaxSize": "100",
-    "maxChildren": "100",
-    "slidingWindowSize": "50",
-    "rootBuilder": [
-      "result",
-      "resultProject_outcome_isProducedBy",
-      "resultResult_publicationDataset_isRelatedTo",
-      "resultResult_similarity_isAmongTopNSimilarDocuments",
-      "resultResult_similarity_hasAmongTopNSimilarDocuments",
-      "resultOrganization_affiliation_isAffiliatedWith",
-      "resultResult_part_hasPart",
-      "resultResult_part_isPartOf",
-      "resultResult_supplement_isSupplementTo",
-      "resultResult_supplement_isSupplementedBy",
-      "resultResult_version_isVersionOf"
-    ],
-    "includeChildren": "true",
-    "maxIterations": 20,
-    "idPath": "$.id"
-  },
-  "pace": {
-    "clustering" : [
-      { "name" : "lnfi", "fields" : [ "fullname" ], "params" : {} }
-    ],
-    "decisionTree": {
-      "start": {
-        "fields": [
-          {
-            "field": "orcid",
-            "comparator": "exactMatch",
-            "weight": 1.0,
-            "countIfUndefined": "true",
-            "params": {}
-          }
-        ],
-        "threshold": 1.0,
-        "aggregation": "MAX",
-        "positive": "MATCH",
-        "negative": "NO_MATCH",
-        "undefined": "orcids",
-        "ignoreUndefined": "true"
-      },
-      "orcids": {
-        "fields": [
-          {
-            "field": "orcids",
-            "comparator": "stringListMatch",
-            "weight": 1.0,
-            "countIfUndefined": "true",
-            "params": {"type": "count"}
-          }
-        ],
-        "threshold": 3.0,
-        "aggregation": "MAX",
-        "positive": "MATCH",
-        "negative": "coauthors",
-        "undefined": "coauthors",
-        "ignoreUndefined": "true"
-      },
-      "coauthors": {
-        "fields": [
-          {
-            "field": "coauthors",
-            "comparator": "authorsMatch",
-            "weight": 1.0,
-            "countIfUndefined": "true",
-            "params": {"type": "count"}
-          }
-        ],
-        "threshold": 1.0,
-        "aggregation": "MAX",
-        "positive": "topicsMatch",
-        "negative": "NO_MATCH",
-        "undefined": "topicsMatch",
-        "ignoreUndefined": "true"
-      },
-      "topicsMatch": {
-        "fields": [
-          {
-            "field": "topics",
-            "comparator": "cosineSimilarity",
-            "weight": 1.0,
-            "countIfUndefined": "true",
-            "params": {}
-          }
-        ],
-        "threshold": 1.0,
-        "aggregation": "MAX",
-        "positive": "MATCH",
-        "negative": "NO_MATCH",
-        "undefined": "NO_MATCH",
-        "ignoreUndefined": "false"
-      }
-    },
-    "model": [
-      {
-        "name": "topics",
-        "type": "DoubleArray",
-        "path": "$.topics"
-      },
-      {
-        "name": "fullname",
-        "type": "String",
-        "path": "$.fullname"
-      },
-      {
-        "name": "orcid",
-        "type": "String",
-        "path": "$.orcid"
-      },
-      {
-        "name": "coauthors",
-        "type": "List",
-        "path": "$.coAuthors[*].fullname"
-      },
-      {
-        "name": "orcids",
-        "type": "List",
-        "path": "$.coAuthors[*].orcid"
-      }
-    ],
-    "blacklists": {},
-    "synonyms": {}
-  }
-}
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/ds.tree.conf.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/ds.tree.conf.json
@ -3,7 +3,7 @@
    "threshold" : "0.99",
    "dedupRun" : "001",
    "entityType" : "datasource",
-    "orderField" : "englishname",
+    "orderField" : "name",
    "queueMaxSize" : "2000",
    "groupMaxSize" : "50",
    "slidingWindowSize" : "200",
@ -14,9 +14,8 @@
  },
  "pace" : {
    "clustering" : [
-      { "name" : "sortedngrampairs", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 2, "ngramLen" : "3", "collapseOn:name": "0"} },
-      { "name" : "suffixprefix", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 1, "len" : "3", "collapseOn:name": "0" } },
-      {"name" : "ngrams", "fields" : ["officialname", "englishname"], "params" : {"ngramLen": 4, "max" : 2, "maxPerToken": 2, "minNgramLen": 1, "collapseOn:name": "0"}},
+      { "name" : "sortedngrampairs", "fields" : [ "name" ], "params" : { "max" : 2, "ngramLen" : "3"} },
+      { "name" : "suffixprefix", "fields" : [ "name" ], "params" : { "max" : 1, "len" : "3" } },
      { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
    ],
    "decisionTree" : {
@ -40,36 +39,16 @@
      "layer2": {
        "fields": [
          {
-            "field": "officialname",
+            "field": "name",
            "comparator": "levensteinTitle",
            "weight": 1.0,
            "countIfUndefined": "true",
            "params": {
-              "threshold": 0.9
-            }
-          },
-          {
-            "field": "englishname",
-            "comparator": "levensteinTitle",
-            "weight": 1.0,
-            "countIfUndefined": "true",
-            "params": {
-              "threshold": 0.9
-            }
-          },
-          {
-            "field": "officialname",
-            "comparator": "levensteinTitle",
-            "weight": 1.0,
-            "countIfUndefined": "true",
-            "params": {
-              "crossCompare": "englishname",
-              "threshold": 0.9
            }
          }
        ],
        "threshold": 0.9,
-        "aggregation": "MAX",
+        "aggregation": "AVG",
        "positive": "MATCH",
        "negative": "NO_MATCH",
        "undefined": "NO_MATCH",
@ -77,11 +56,12 @@
      }
    },
    "model" : [
-      { "name" : "englishname", "type" : "String", "path" : "$.englishname" },
-      { "name" : "officialname", "type" : "String", "path" : "$.officialname" },
+      { "name" : "name", "type" : "String", "path" : "$.name" },
      { "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl" }
    ],
-    "blacklists" : {},
+    "blacklists" : {
+      "legalname" : []
+    },
    "synonyms": {}
  }
 }
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.current.conf.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.current.conf.json
@ -3,9 +3,8 @@
    "threshold" : "0.99",
    "dedupRun" : "001",
    "entityType" : "organization",
-    "subEntityValue": "organization",
    "orderField" : "legalname",
-    "queueMaxSize" : "100000",
+    "queueMaxSize" : "2000",
    "groupMaxSize" : "50",
    "slidingWindowSize" : "200",
    "idPath":"$.id",
@ -144,10 +143,10 @@
      }
    },
    "model" : [
-      { "name" : "country", "type" : "String", "path" : "$.country.classid"},
-      { "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"},
-      { "name" : "legalname", "type" : "String", "path" : "$.legalname.value" },
-      { "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" },
+      { "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"},
+      { "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"},
+      { "name" : "legalname", "type" : "String", "path" : "$.organization.metadata.legalname.value" },
+      { "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" },
      { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"},
      { "name" : "originalId", "type" : "String", "path" : "$.id" }
    ],
@ -155,7 +154,7 @@
      "legalname" : []
    },
    "synonyms": {
-      "key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti", "Πανεπιστήμιο", "panepistemio"],
+      "key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
      "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
      "key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
      "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
@ -164,7 +163,7 @@
      "key::7": ["college","collegio","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","κολλέγιο"],
      "key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
      "key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
-      "key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό", "eθνικό"],
+      "key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],
      "key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],
      "key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],
      "key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/pub.new.tree.conf.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/pub.new.tree.conf.json
@ -178,7 +178,6 @@
            "params": {
              "surname_th": 0.75,
              "fullname_th": 0.75,
-              "size_th": 20,
              "mode": "surname"
            }
          }
@ -216,8 +215,8 @@
      },
      {
        "name": "title",
-        "type": "StringConcat",
-        "path": "$.title[?(@.qualifier.classid == 'main title')].value|||$.title[?(@.qualifier.classid == 'subtitle')].value",
+        "type": "String",
+        "path": "$.title[?(@.qualifier.classid == 'main title')].value",
        "length": 250,
        "size": 5
      },
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/pub.prod.tree.conf.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/pub.prod.tree.conf.json
@ -51,6 +51,37 @@
    ],
    "decisionTree": {
      "start": {
+        "fields": [
+          {
+            "field": "pid",
+            "comparator": "jsonListMatch",
+            "weight": 1.0,
+            "countIfUndefined": "false",
+            "params": {
+              "jpath_value": "$.value",
+              "jpath_classid": "$.qualifier.classid"
+            }
+          },
+          {
+            "field": "pid",
+            "comparator": "jsonListMatch",
+            "weight": 1.0,
+            "countIfUndefined": "false",
+            "params": {
+              "jpath_value": "$.value",
+              "jpath_classid": "$.qualifier.classid",
+              "crossCompare": "alternateid"
+            }
+          }
+        ],
+        "threshold": 0.5,
+        "aggregation": "MAX",
+        "positive": "layer1",
+        "negative": "layer2",
+        "undefined": "layer2",
+        "ignoreUndefined": "true"
+      },
+      "layer1": {
        "fields": [
          {
            "field": "title",
@ -63,8 +94,49 @@
        "threshold": 0.9,
        "aggregation": "AVG",
        "positive": "MATCH",
-        "negative": "MATCH",
-        "undefined": "MATCH",
+        "negative": "NO_MATCH",
+        "undefined": "NO_MATCH",
+        "ignoreUndefined": "true"
+      },
+      "layer2": {
+        "fields": [
+          {
+            "field": "title",
+            "comparator": "titleVersionMatch",
+            "weight": 1.0,
+            "countIfUndefined": "false",
+            "params": {}
+          },
+          {
+            "field": "authors",
+            "comparator": "sizeMatch",
+            "weight": 1.0,
+            "countIfUndefined": "false",
+            "params": {}
+          }
+        ],
+        "threshold": 1.0,
+        "aggregation": "AND",
+        "positive": "layer3",
+        "negative": "NO_MATCH",
+        "undefined": "layer3",
+        "ignoreUndefined": "false"
+      },
+      "layer3": {
+        "fields": [
+          {
+            "field": "title",
+            "comparator": "levensteinTitle",
+            "weight": 1.0,
+            "countIfUndefined": "true",
+            "params": {}
+          }
+        ],
+        "threshold": 0.99,
+        "aggregation": "AVG",
+        "positive": "MATCH",
+        "negative": "NO_MATCH",
+        "undefined": "NO_MATCH",
        "ignoreUndefined": "true"
      }
    },
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/publication.current.conf.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/publication.current.conf.json
@ -6,9 +6,9 @@
    "subEntityType": "resulttype",
    "subEntityValue": "publication",
    "orderField": "title",
-    "queueMaxSize": "200",
-    "groupMaxSize": "100",
-    "maxChildren": "100",
+    "queueMaxSize": "5000",
+    "groupMaxSize": "2000",
+    "maxChildren": "1000",
    "slidingWindowSize": "50",
    "rootBuilder": [
      "result",
@ -28,26 +28,9 @@
    "idPath": "$.id"
  },
  "pace": {
-    "clustering": [
-      {
-        "name": "wordsStatsSuffixPrefixChain",
-        "fields": [
-          "title"
-        ],
-        "params": {
-          "mod": "10"
-        }
-      },
-      {
-        "name": "lowercase",
-        "fields": [
-          "doi",
-          "altdoi"
-        ],
-        "params": {
-          "collapseOn:pid": "0"
-        }
-      }
+    "clustering" : [
+      { "name" : "wordsStatsSuffixPrefixChain", "fields" : [ "title" ], "params" : { "mod" : "10" } },
+      { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
    ],
    "decisionTree": {
      "start": {
@ -59,75 +42,18 @@
            "countIfUndefined": "false",
            "params": {
              "jpath_value": "$.value",
-              "jpath_classid": "$.qualifier.classid",
-              "mode": "count"
+              "jpath_classid": "$.qualifier.classid"
            }
          }
        ],
-        "threshold": 1.0,
-        "aggregation": "MAX",
-        "positive": "MATCH",
-        "negative": "instanceTypeCheck",
-        "undefined": "instanceTypeCheck",
-        "ignoreUndefined": "false"
-      },
-      "instanceTypeCheck": {
-        "fields": [
-          {
-            "field": "instance",
-            "comparator": "instanceTypeMatch",
-            "weight": 1.0,
-            "countIfUndefined": "false",
-            "params": {}
-          }
-        ],
        "threshold": 0.5,
-        "aggregation": "MAX",
-        "positive": "pidVSaltid",
-        "negative": "NO_MATCH",
-        "undefined": "pidVSaltid",
-        "ignoreUndefined": "true"
-      },
-      "pidVSaltid": {
-        "fields": [
-          {
-            "field": "pid",
-            "comparator": "jsonListMatch",
-            "weight": 1.0,
-            "countIfUndefined": "false",
-            "params": {
-              "jpath_value": "$.value",
-              "jpath_classid": "$.qualifier.classid",
-              "crossCompare": "alternateid",
-              "mode": "count"
-            }
-          }
-        ],
-        "threshold": 1.0,
-        "aggregation": "MAX",
-        "positive": "softCheck",
-        "negative": "earlyExits",
-        "undefined": "earlyExits",
-        "ignoreUndefined": "true"
-      },
-      "softCheck": {
-        "fields": [
-          {
-            "field": "title",
-            "comparator": "levensteinTitle",
-            "weight": 1.0,
-            "countIfUndefined": "true",
-            "params": {}
-          }
-        ],
-        "threshold": 0.9,
        "aggregation": "AVG",
        "positive": "MATCH",
-        "negative": "NO_MATCH",
-        "undefined": "NO_MATCH",
+        "negative": "layer2",
+        "undefined": "layer2",
        "ignoreUndefined": "true"
      },
-      "earlyExits": {
+      "layer2": {
        "fields": [
          {
            "field": "title",
@ -146,12 +72,12 @@
        ],
        "threshold": 1.0,
        "aggregation": "AND",
-        "positive": "strongCheck",
+        "positive": "layer3",
        "negative": "NO_MATCH",
-        "undefined": "strongCheck",
+        "undefined": "layer3",
        "ignoreUndefined": "false"
      },
-      "strongCheck": {
+      "layer3": {
        "fields": [
          {
            "field": "title",
@ -163,30 +89,9 @@
        ],
        "threshold": 0.99,
        "aggregation": "AVG",
-        "positive": "surnames",
-        "negative": "NO_MATCH",
-        "undefined": "NO_MATCH",
-        "ignoreUndefined": "true"
-      },
-      "surnames": {
-        "fields": [
-          {
-            "field": "authors",
-            "comparator": "authorsMatch",
-            "weight": 1.0,
-            "countIfUndefined": "false",
-            "params": {
-              "surname_th": 0.75,
-              "fullname_th": 0.75,
-              "mode": "full"
-            }
-          }
-        ],
-        "threshold": 0.6,
-        "aggregation": "MAX",
        "positive": "MATCH",
        "negative": "NO_MATCH",
-        "undefined": "MATCH",
+        "undefined": "NO_MATCH",
        "ignoreUndefined": "true"
      }
    },
@ -194,29 +99,18 @@
      {
        "name": "doi",
        "type": "String",
-        "path": "$.instance[*].pid[?(@.qualifier.classid == 'doi')].value"
-      },
-      {
-        "name": "altdoi",
-        "type": "String",
-        "path": "$.instance[*].alternateIdentifier[?(@.qualifier.classid == 'doi')].value"
+        "path": "$.pid[?(@.qualifier.classid == 'doi')].value"
      },
      {
        "name": "pid",
        "type": "JSON",
-        "path": "$.instance[*].pid[*]",
-        "overrideMatch": "true"
-      },
-      {
-        "name": "alternateid",
-        "type": "JSON",
-        "path": "$.instance[*].alternateIdentifier[*]",
+        "path": "$.pid",
        "overrideMatch": "true"
      },
      {
        "name": "title",
-        "type": "StringConcat",
-        "path": "$.title[?(@.qualifier.classid == 'main title')].value|||$.title[?(@.qualifier.classid == 'subtitle')].value",
+        "type": "String",
+        "path": "$.title[?(@.qualifier.classid == 'main title')].value",
        "length": 250,
        "size": 5
      },
@ -230,11 +124,6 @@
        "name": "resulttype",
        "type": "String",
        "path": "$.resulttype.classid"
-      },
-      {
-        "name": "instance",
-        "type": "List",
-        "path": "$.instance[*].instancetype.classname"
      }
    ],
    "blacklists": {
@ -465,16 +354,7 @@
        "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
        "^(Measurement of the spin\\-dependent structure function).*",
        "(?i)^.*authors['’′]? reply\\.?$",
-        "(?i)^.*authors['’′]? response\\.?$",
-        "^Data [mM]anagement [sS]ervices\\.$",
-        "Research and Advanced Technology for Digital Libraries",
-        "(?i)^risky business$",
-        "(?i)^great expectations\\.?$",
-        "(?i)^what's in a name\\?$",
-        "(?i)^decisions, decisions\\.?$",
-        "(?i)^update to our reader, reviewer, and author communities.*",
-        "(?i)^lest we forget$",
-        "(?i)^measure for measure$"
+        "(?i)^.*authors['’′]? response\\.?$"
      ]
    },
    "synonyms": {}
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/pubs.fdup.exp.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/pubs.fdup.exp.json
@ -1,381 +0,0 @@
-{
-  "wf": {
-    "threshold": "0.99",
-    "dedupRun": "001",
-    "entityType": "result",
-    "subEntityType": "resulttype",
-    "subEntityValue": "publication",
-    "orderField": "title",
-    "queueMaxSize": "200",
-    "groupMaxSize": "100",
-    "maxChildren": "100",
-    "slidingWindowSize": "100",
-    "rootBuilder": [
-      "result",
-      "resultProject_outcome_isProducedBy",
-      "resultResult_publicationDataset_isRelatedTo",
-      "resultResult_similarity_isAmongTopNSimilarDocuments",
-      "resultResult_similarity_hasAmongTopNSimilarDocuments",
-      "resultOrganization_affiliation_isAffiliatedWith",
-      "resultResult_part_hasPart",
-      "resultResult_part_isPartOf",
-      "resultResult_supplement_isSupplementTo",
-      "resultResult_supplement_isSupplementedBy",
-      "resultResult_version_isVersionOf"
-    ],
-    "includeChildren": "true",
-    "maxIterations": 20,
-    "idPath": "$.id"
-  },
-  "pace": {
-    "clustering" : [
-      { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
-      { "name" : "lowercase", "fields" : [ "doi", "altdoi" ], "params" : { "collapseOn:pid": "0"} }
-    ],
-    "decisionTree": {
-      "start": {
-        "fields": [
-          {
-            "field": "pid",
-            "comparator": "jsonListMatch",
-            "weight": 1.0,
-            "countIfUndefined": "false",
-            "params": {
-              "jpath_value": "$.value",
-              "jpath_classid": "$.qualifier.classid",
-               "mode": "count"
-            }
-          }
-        ],
-        "threshold": 1.0,
-        "aggregation": "MAX",
-        "positive": "MATCH",
-        "negative": "versionCheck",
-        "undefined": "versionCheck",
-        "ignoreUndefined": "true"
-      },
-      "versionCheck": {
-        "fields": [
-          {
-            "field": "title",
-            "comparator": "titleVersionMatch",
-            "weight": 1.0,
-            "countIfUndefined": "false",
-            "params": {}
-          }
-        ],
-        "threshold": 1.0,
-        "aggregation": "MAX",
-        "positive": "titleCheck",
-        "negative": "NO_MATCH",
-        "undefined": "titleCheck",
-        "ignoreUndefined": "false"
-      },
-      "titleCheck": {
-        "fields": [
-          {
-            "field": "title",
-            "comparator": "levensteinTitle",
-            "weight": 1.0,
-            "countIfUndefined": "true",
-            "params": {}
-          }
-        ],
-        "threshold": 0.9,
-        "aggregation": "MAX",
-        "positive": "authorsCheck",
-        "negative": "NO_MATCH",
-        "undefined": "NO_MATCH",
-        "ignoreUndefined": "true"
-      },
-      "authorsCheck": {
-        "fields": [
-          {
-            "field": "authors",
-            "comparator": "authorsMatch",
-            "weight": 1.0,
-            "countIfUndefined": "false",
-            "params": {}
-          }
-        ],
-        "threshold": 0.6,
-        "aggregation": "MAX",
-        "positive": "MATCH",
-        "negative": "NO_MATCH",
-        "undefined": "MATCH",
-        "ignoreUndefined": "false"
-      }
-    },
-    "model": [
-      {
-        "name": "doi",
-        "type": "String",
-        "path": "$.instance[*].pid[?(@.qualifier.classid == 'doi')].value"
-      },
-      {
-        "name": "altdoi",
-        "type": "String",
-        "path": "$.instance[*].alternateIdentifier[?(@.qualifier.classid == 'doi')].value"
-      },
-      {
-        "name": "pid",
-        "type": "JSON",
-        "path": "$.instance[*].pid[*]",
-        "overrideMatch": "true"
-      },
-      {
-        "name": "title",
-        "type": "String",
-        "path": "$.title[?(@.qualifier.classid == 'main title')].value",
-        "length": 250,
-        "size": 5
-      },
-      {
-        "name": "authors",
-        "type": "List",
-        "path": "$.author[*].fullname",
-        "size": 200
-      },
-      {
-        "name": "resulttype",
-        "type": "String",
-        "path": "$.resulttype.classid"
-      }
-    ],
-    "blacklists": {
-      "title": [
-        "(?i)^Data Management Plan",
-        "^Inside Front Cover$",
-        "(?i)^Poster presentations$",
-        "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
-        "^Problems with perinatal pathology\\.?$",
-        "(?i)^Cases? of Puerperal Convulsions$",
-        "(?i)^Operative Gyna?ecology$",
-        "(?i)^Mind the gap\\!?\\:?$",
-        "^Chronic fatigue syndrome\\.?$",
-        "^Cartas? ao editor Letters? to the Editor$",
-        "^Note from the Editor$",
-        "^Anesthesia Abstract$",
-        "^Annual report$",
-        "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
-        "(?i)^Graph and Table of Infectious Diseases?$",
-        "^Presentation$",
-        "(?i)^Reviews and Information on Publications$",
-        "(?i)^PUBLIC HEALTH SERVICES?$",
-        "(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
-        "(?i)^Adrese autora$",
-        "(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
-        "(?i)^Acknowledgement to Referees$",
-        "(?i)^Behçet's disease\\.?$",
-        "(?i)^Isolation and identification of restriction endonuclease.*$",
-        "(?i)^CEREBROVASCULAR DISEASES?.?$",
-        "(?i)^Screening for abdominal aortic aneurysms?\\.?$",
-        "^Event management$",
-        "(?i)^Breakfast and Crohn's disease.*\\.?$",
-        "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
-        "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
-        "^Gushi hakubutsugaku$",
-        "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
-        "^Intestinal spirocha?etosis$",
-        "^Treatment of Rodent Ulcer$",
-        "(?i)^\\W*Cloud Computing\\W*$",
-        "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
-        "^Free Communications, Poster Presentations: Session [A-F]$",
-        "^“The Historical Aspects? of Quackery\\.?”$",
-        "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
-        "^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
-        "(?i)^Case Report$",
-        "^Boletín Informativo$",
-        "(?i)^Glioblastoma Multiforme$",
-        "(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
-        "^Zaměstnanecké výhody$",
-        "(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
-        "(?i)^Carotid body tumours?\\.?$",
-        "(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
-        "^Avant-propos$",
-        "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
-        "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
-        "(?i)^PUBLIC HEALTH VERSUS THE STATE$",
-        "^Viñetas de Cortázar$",
-        "(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
-        "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
-        "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
-        "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
-        "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
-        "^Aus der AGMB$",
-        "^Znanstveno-stručni prilozi$",
-        "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
-        "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
-        "(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
-        "^Finanční analýza podniku$",
-        "^Financial analysis( of business)?$",
-        "(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
-        "^Jikken nihon shūshinsho$",
-        "(?i)^CORONER('|s)(s|') INQUESTS$",
-        "(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
-        "(?i)^Consultants' contract(s)?$",
-        "(?i)^Upute autorima$",
-        "(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
-        "^Joshi shin kokubun$",
-        "^Kōtō shōgaku dokuhon nōson'yō$",
-        "^Jinjō shōgaku shōka$",
-        "^Shōgaku shūjichō$",
-        "^Nihon joshi dokuhon$",
-        "^Joshi shin dokuhon$",
-        "^Chūtō kanbun dokuhon$",
-        "^Wabun dokuhon$",
-        "(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
-        "(?i)^cardiac rehabilitation$",
-        "(?i)^Analytical summary$",
-        "^Thesaurus resolutionum Sacrae Congregationis Concilii$",
-        "(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
-        "^Prikazi i osvrti$",
-        "^Rodinný dům s provozovnou$",
-        "^Family house with an establishment$",
-        "^Shinsei chūtō shin kokugun$",
-        "^Pulmonary alveolar proteinosis(\\.?)$",
-        "^Shinshū kanbun$",
-        "^Viñeta(s?) de Rodríguez$",
-        "(?i)^RUBRIKA UREDNIKA$",
-        "^A Matching Model of the Academic Publication Market$",
-        "^Yōgaku kōyō$",
-        "^Internetový marketing$",
-        "^Internet marketing$",
-        "^Chūtō kokugo dokuhon$",
-        "^Kokugo dokuhon$",
-        "^Antibiotic Cover for Dental Extraction(s?)$",
-        "^Strategie podniku$",
-        "^Strategy of an Enterprise$",
-        "(?i)^respiratory disease(s?)(\\.?)$",
-        "^Award(s?) for Gallantry in Civil Defence$",
-        "^Podniková kultura$",
-        "^Corporate Culture$",
-        "^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
-        "^Pracovní motivace$",
-        "^Work Motivation$",
-        "^Kaitei kōtō jogaku dokuhon$",
-        "^Konsolidovaná účetní závěrka$",
-        "^Consolidated Financial Statements$",
-        "(?i)^intracranial tumour(s?)$",
-        "^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
-        "^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
-        "^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
-        "^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
-        "^Úroveň motivačního procesu jako způsobu vedení lidí$",
-        "^The level of motivation process as a leadership$",
-        "^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
-        "(?i)^news and events$",
-        "(?i)^NOVOSTI I DOGAĐAJI$",
-        "^Sansū no gakushū$",
-        "^Posouzení informačního systému firmy a návrh změn$",
-        "^Information System Assessment and Proposal for ICT Modification$",
-        "^Stresové zatížení pracovníků ve vybrané profesi$",
-        "^Stress load in a specific job$",
-        "^Sunday: Poster Sessions, Pt.*$",
-        "^Monday: Poster Sessions, Pt.*$",
-        "^Wednesday: Poster Sessions, Pt.*",
-        "^Tuesday: Poster Sessions, Pt.*$",
-        "^Analýza reklamy$",
-        "^Analysis of advertising$",
-        "^Shōgaku shūshinsho$",
-        "^Shōgaku sansū$",
-        "^Shintei joshi kokubun$",
-        "^Taishō joshi kokubun dokuhon$",
-        "^Joshi kokubun$",
-        "^Účetní uzávěrka a účetní závěrka v ČR$",
-        "(?i)^The \"?Causes\"? of Cancer$",
-        "^Normas para la publicación de artículos$",
-        "^Editor('|s)(s|') [Rr]eply$",
-        "^Editor(’|s)(s|’) letter$",
-        "^Redaktoriaus žodis$",
-        "^DISCUSSION ON THE PRECEDING PAPER$",
-        "^Kōtō shōgaku shūshinsho jidōyō$",
-        "^Shōgaku nihon rekishi$",
-        "^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
-        "^Préface$",
-        "^Occupational [Hh]ealth [Ss]ervices.$",
-        "^In Memoriam Professor Toshiyuki TAKESHIMA$",
-        "^Účetní závěrka ve vybraném podniku.*$",
-        "^Financial statements in selected company$",
-        "^Abdominal [Aa]ortic [Aa]neurysms.*$",
-        "^Pseudomyxoma peritonei$",
-        "^Kazalo autora$",
-        "(?i)^uvodna riječ$",
-        "^Motivace jako způsob vedení lidí$",
-        "^Motivation as a leadership$",
-        "^Polyfunkční dům$",
-        "^Multi\\-funkcional building$",
-        "^Podnikatelský plán$",
-        "(?i)^Podnikatelský záměr$",
-        "(?i)^Business Plan$",
-        "^Oceňování nemovitostí$",
-        "^Marketingová komunikace$",
-        "^Marketing communication$",
-        "^Sumario Analítico$",
-        "^Riječ uredništva$",
-        "^Savjetovanja i priredbe$",
-        "^Índice$",
-        "^(Starobosanski nadpisi).*$",
-        "^Vzdělávání pracovníků v organizaci$",
-        "^Staff training in organization$",
-        "^(Life Histories of North American Geometridae).*$",
-        "^Strategická analýza podniku$",
-        "^Strategic Analysis of an Enterprise$",
-        "^Sadržaj$",
-        "^Upute suradnicima$",
-        "^Rodinný dům$",
-        "(?i)^Fami(l)?ly house$",
-        "^Upute autorima$",
-        "^Strategic Analysis$",
-        "^Finanční analýza vybraného podniku$",
-        "^Finanční analýza$",
-        "^Riječ urednika$",
-        "(?i)^Content(s?)$",
-        "(?i)^Inhalt$",
-        "^Jinjō shōgaku shūshinsho jidōyō$",
-        "(?i)^Index$",
-        "^Chūgaku kokubun kyōkasho$",
-        "^Retrato de una mujer$",
-        "^Retrato de un hombre$",
-        "^Kōtō shōgaku dokuhon$",
-        "^Shotōka kokugo$",
-        "^Shōgaku dokuhon$",
-        "^Jinjō shōgaku kokugo dokuhon$",
-        "^Shinsei kokugo dokuhon$",
-        "^Teikoku dokuhon$",
-        "^Instructions to Authors$",
-        "^KİTAP TAHLİLİ$",
-        "^PRZEGLĄD PIŚMIENNICTWA$",
-        "(?i)^Presentación$",
-        "^İçindekiler$",
-        "(?i)^Tabl?e of contents$",
-        "^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
-        "^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
-        "^Editorial( Board)?$",
-        "(?i)^Editorial \\(English\\)$",
-        "^Editörden$",
-        "^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
-        "^(Kiri Karl Morgensternile).*$",
-        "^(\\[Eksliibris Aleksandr).*\\]$",
-        "^(\\[Eksliibris Aleksandr).*$",
-        "^(Eksliibris Aleksandr).*$",
-        "^(Kiri A\\. de Vignolles).*$",
-        "^(2 kirja Karl Morgensternile).*$",
-        "^(Pirita kloostri idaosa arheoloogilised).*$",
-        "^(Kiri tundmatule).*$",
-        "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
-        "^(Eksliibris Nikolai Birukovile).*$",
-        "^(Eksliibris Nikolai Issakovile).*$",
-        "^(WHP Cruise Summary Information of section).*$",
-        "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
-        "^(Measurement of the spin\\-dependent structure function).*",
-        "(?i)^.*authors['’′]? reply\\.?$",
-        "(?i)^.*authors['’′]? response\\.?$",
-        "^Data [mM]anagement [sS]ervices\\.$",
-        "Research and Advanced Technology for Digital Libraries",
-        "Food and Nutrition"
-      ]
-    },
-    "synonyms": {}
-  }
-}
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/sw.tree.conf.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/sw.tree.conf.json
@ -1,150 +0,0 @@
-{
-  "wf" : {
-    "threshold" : "0.99",
-    "dedupRun" : "001",
-    "entityType" : "result",
-    "subEntityType" : "resulttype",
-    "subEntityValue" : "software",
-    "orderField" : "title",
-    "queueMaxSize" : "200",
-    "groupMaxSize" : "100",
-    "maxChildren" : "100",
-    "slidingWindowSize" : "50",
-    "rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
-    "includeChildren" : "true"
-  },
-  "pace" : {
-    "clustering" : [
-      { "name" : "wordsStatsSuffixPrefixChain", "fields" : [ "title" ], "params" : { "mod" : "10" } },
-      { "name" : "lowercase", "fields" : [ "doi", "altdoi" ], "params" : { "collapseOn:pid":"0"} },
-      { "name" : "ngrams", "fields" : [ "title" ], "params" : {"ngramLen": 3, "max": 4, "maxPerToken":1, "minNgramLen":3}},
-      { "name" : "urlclustering", "fields": [ "url" ], "params" : {}}
-    ],
-    "decisionTree": {
-      "start": {
-        "fields": [
-          {
-            "field": "doi",
-            "comparator": "exactMatch",
-            "weight": 1,
-            "countIfUndefined": "false",
-            "params": {}
-          }
-        ],
-        "threshold": 1,
-        "aggregation": "MAX",
-        "positive": "MATCH",
-        "negative": "titleCheck",
-        "undefined": "titleCheck",
-        "ignoreUndefined": "false"
-      },
-      "titleCheck": {
-        "fields": [
-          {
-            "field": "title",
-            "comparator": "levensteinTitleIgnoreVersion",
-            "weight": 1,
-            "countIfUndefined": "false",
-            "params": {}
-          }
-        ],
-        "threshold": 0.95,
-        "aggregation": "AVG",
-        "positive": "pidCheck",
-        "negative": "NO_MATCH",
-        "undefined": "NO_MATCH",
-        "ignoreUndefined": "false"
-      },
-      "pidCheck": {
-        "fields": [
-          {
-            "field": "altdoi",
-            "comparator": "exactMatch",
-            "weight": 1,
-            "countIfUndefined": "false",
-            "params": {}
-          },
-          {
-            "field": "doi",
-            "comparator": "exactMatch",
-            "weight": 1,
-            "countIfUndefined": "false",
-            "params": {"crossCompare": "altdoi"}
-          },
-          {
-            "field": "url",
-            "comparator": "exactMatch",
-            "weight": 1,
-            "countIfUndefined": "false",
-            "params": {}
-          }
-        ],
-        "threshold": 1,
-        "aggregation": "OR",
-        "positive": "MATCH",
-        "negative": "authorsCheck",
-        "undefined": "authorsCheck",
-        "ignoreUndefined": "false"
-      },
-      "authorsCheck": {
-        "fields": [
-          {
-            "field": "authors",
-            "comparator": "authorsMatch",
-            "weight": 1.0,
-            "countIfUndefined": "false",
-            "params": {
-              "surname_th": 0.70,
-              "fullname_th": 0.70,
-              "size_th": 20,
-              "mode": "surname"
-            }
-          }
-        ],
-        "threshold": 1,
-        "aggregation": "AVG",
-        "positive": "MATCH",
-        "negative": "NO_MATCH",
-        "undefined": "MATCH",
-        "ignoreUndefined": "false"
-      }
-    },
-    "model" : [
-      {
-        "name" : "doi",
-        "type" : "String",
-        "path" : "$.instance.pid[?(@.qualifier.classid == 'doi')].value"
-      },
-      {
-        "name" : "altdoi",
-        "type" : "String",
-        "path" : "$.instance.alternateIdentifier[?(@.qualifier.classid == 'doi')].value"
-      },
-      {
-        "name" : "title",
-        "type" : "String",
-        "path" : "$.title[?(@.qualifier.classid == 'main title')].value",
-        "length" : 250,
-        "size" : 5
-      },
-      {
-        "name" : "url",
-        "type" : "String",
-        "path" : "$.instance.url"
-      },
-      {
-        "name" : "resulttype",
-        "type" : "String",
-        "path" : "$.resulttype.classid"
-      },
-      {
-        "name": "authors",
-        "type": "List",
-        "path": "$.author[*].fullname",
-        "size": 200
-      }
-    ],
-    "blacklists" : {},
-    "synonyms": {}
-  }
-}
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/authors.dump.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/authors.dump.json
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/ds.to.fix.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/ds.to.fix.json
@ -1,4 +0,0 @@
-{"websiteurl": "https://fairsharing.org", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "eosc________::oxford_e-research_centre::oxford_e-research_centre.fairsharing"}
-{"websiteurl": "https://FAIRsharing.org", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "fairsharing_::2521"}
-{"websiteurl": "https://fairsharing.org/", "englishname": "formerly: biosharing", "officialname": "FAIRsharing", "id": "re3data_____::r3d100010142"}
-{"websiteurl": "https://fairsharing.org/", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "openaire____::fairsharing"}
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/openorgs.to.fix.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/openorgs.to.fix.json
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/publications.to.fix.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/publications.to.fix.json
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/software.dump.2000.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/software.dump.2000.json
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/software.to.fix.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/software.to.fix.json
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/softwares.dump.2000.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/softwares.dump.2000.json
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/parameters/computeStatistics_parameters.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/parameters/computeStatistics_parameters.json
@ -1,32 +0,0 @@
-[
-  {
-    "paramName": "e",
-    "paramLongName": "entitiesPath",
-    "paramDescription": "the input entities",
-    "paramRequired": true
-  },
-  {
-    "paramName": "w",
-    "paramLongName": "workingPath",
-    "paramDescription": "path of the working directory",
-    "paramRequired": true
-  },
-  {
-    "paramName": "np",
-    "paramLongName": "numPartitions",
-    "paramDescription": "number of partitions for the similarity relations intermediate phases",
-    "paramRequired": false
-  },
-  {
-    "paramName": "dc",
-    "paramLongName": "dedupConfPath",
-    "paramDescription": "dedup configuration to be used",
-    "paramRequired": false
-  },
-  {
-    "paramName": "gt",
-    "paramLongName": "groundTruthFieldJPath",
-    "paramDescription": "field to be used as groundtruth",
-    "paramRequired": false
-  }
-]
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/parameters/createDedupEntity_parameters.json
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/parameters/createDedupEntity_parameters.json
--- a/dnet-pace-core/pom.xml
+++ b/dnet-pace-core/pom.xml
@ -6,7 +6,7 @@
 	<parent>
 		<groupId>eu.dnetlib</groupId>
 		<artifactId>dnet-dedup</artifactId>
-		<version>4.1.13-SNAPSHOT</version>
+		<version>4.1.9</version>
        <relativePath>../pom.xml</relativePath>
 	</parent>

@ -67,11 +67,6 @@
 			<artifactId>json-path</artifactId>
 		</dependency>

-		<dependency>
-			<groupId>com.ibm.icu</groupId>
-			<artifactId>icu4j</artifactId>
-		</dependency>
-
 	</dependencies>

 </project>
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java
@ -1,59 +1,59 @@
 package eu.dnetlib.pace.clustering;

-import com.google.common.collect.Maps;
-import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Document;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.FieldListImpl;
-import eu.dnetlib.pace.model.MapDocument;
-
 import java.util.Collection;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
-import java.util.regex.Pattern;
+import java.util.Set;
+
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Document;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldListImpl;
+import eu.dnetlib.pace.model.MapDocument;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;

 public class BlacklistAwareClusteringCombiner extends ClusteringCombiner {

-    public static Collection<String> filterAndCombine(final MapDocument a, final Config conf) {
-        Document filtered = filter(a, conf.blacklists());
-        return combine(filtered, conf);
-    }
+	private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombiner.class);

-    private static MapDocument filter(final MapDocument a, final Map<String, List<Pattern>> blacklists) {
-        if (blacklists == null || blacklists.isEmpty()) {
-            return a;
-        }
+	public static Collection<String> filterAndCombine(final MapDocument a, final Config conf) {

-        final Map<String, Field> filtered = Maps.newHashMap(a.getFieldMap());
+		final Document filtered = new BlacklistAwareClusteringCombiner().filter(a, conf.blacklists());
+		return combine(filtered, conf);
+	}

-        for (final Entry<String, List<Pattern>> e : blacklists.entrySet()) {
-            Field fields = a.getFieldMap().get(e.getKey());
-            if (fields != null) {
-                final FieldListImpl fl = new FieldListImpl();
+	private MapDocument filter(final MapDocument a, final Map<String, List<String>> blacklists) {
+		final Map<String, Field> filtered = Maps.newHashMap(a.getFieldMap());
+		if (blacklists != null) {
+			for (final Entry<String, Field> e : filtered.entrySet()) {

-                for (Field f : fields) {
-                    if (!isBlackListed(f.stringValue(), e.getValue())) {
-                        fl.add(f);
-                    }
-                }
-
-                filtered.put(e.getKey(), fl);
-            }
-        }
-
-        return new MapDocument(a.getIdentifier(), filtered);
-    }
-
-    private static boolean isBlackListed(String value, List<Pattern> blacklist) {
-        for (Pattern pattern : blacklist) {
-            if (pattern.matcher(value).matches()) {
-                return true;
-            }
-        }
-
-        return false;
-    }
+				final FieldListImpl fl = new FieldListImpl();
+				fl.addAll(Lists.newArrayList(Iterables.filter(e.getValue(), new FieldFilter(e.getKey(), blacklists))));
+				filtered.put(e.getKey(), fl);
+			}
+		}
+		return new MapDocument(a.getIdentifier(), filtered);
+	}

+	/**
+	 * Tries to match the fields in the regex blacklist.
+	 *
+	 * @param fieldName
+	 * @param value
+	 * @return true if the field matches, false otherwise
+	 */
+	protected boolean regexMatches(final String fieldName, final String value, final Map<String, Set<String>> blacklists) {
+		if (blacklists.containsKey(fieldName)) {
+			for (final String regex : blacklists.get(fieldName)) {
+				if (value.matches(regex)) return true;
+			}
+		}
+		return false;
+	}
 }
-
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java
@ -20,6 +20,10 @@ public class ClusteringCombiner {
 	private static String COLLAPSE_ON= "collapseOn";

 	public static Collection<String> combine(final Document a, final Config conf) {
+		return new ClusteringCombiner().doCombine(a, conf);
+	}
+
+	private Collection<String> doCombine(final Document a, final Config conf) {
 		final Collection<String> res = Sets.newLinkedHashSet();
 		for (final ClusteringDef cd : conf.clusterings()) {
 			for (final String fieldName : cd.getFields()) {
@ -47,7 +51,7 @@ public class ClusteringCombiner {
 		return res;
 	}

-	private static String getPrefix(ClusteringDef cd, String fieldName) {
+	private String getPrefix(ClusteringDef cd, String fieldName) {
 		return cd.getName()+ SEPARATOR +
 				cd.getParams().keySet()
 						.stream()
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/FieldFilter.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/FieldFilter.java
@ -0,0 +1,48 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.base.Predicate;
+
+import eu.dnetlib.pace.model.Field;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+public class FieldFilter implements Predicate<Field> {
+
+	private static final Log log = LogFactory.getLog(FieldFilter.class);
+
+	private Map<String, List<String>> blacklists;
+
+	private String filedName;
+
+	public FieldFilter(final String fieldName, final Map<String, List<String>> blacklists) {
+		this.filedName = fieldName;
+		this.blacklists = blacklists;
+	}
+
+	@Override
+	public boolean apply(final Field f) {
+		return !regexMatches(filedName, f.stringValue(), blacklists);
+	}
+
+	/**
+	 * Tries to match the fields in the regex blacklist.
+	 *
+	 * @param fieldName
+	 * @param value
+	 * @return true if the field matches, false otherwise
+	 */
+	protected boolean regexMatches(final String fieldName, final String value, final Map<String, List<String>> blacklists) {
+		if (blacklists.containsKey(fieldName)) {
+			final Iterable<String> regexes = blacklists.get(fieldName);
+			for (final String regex : regexes) {
+				if (StringUtils.isBlank(regex)) return false;
+				if (value.matches(regex)) return true;
+			}
+		}
+		return false;
+	}
+}
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java
@ -41,7 +41,7 @@ public class KeywordsClustering extends AbstractClusteringFunction {
    public Collection<String> apply(final Config conf, List<Field> fields) {
        return fields.stream().filter(f -> !f.isEmpty())
                .map(Field::stringValue)
-                .map(this::cleanup)
+                .map(this::cleanup) //TODO can I add this to the AbstractClusteringFunction without overriding the method here?
                .map(this::normalize)
                .map(s -> filterAllStopWords(s))
                .map(s -> doApply(conf, s))
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java
@ -1,77 +0,0 @@
-package eu.dnetlib.pace.clustering;
-
-import com.google.common.collect.Lists;
-import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.Person;
-import org.apache.commons.lang3.StringUtils;
-
-import java.util.*;
-import java.util.stream.Collectors;
-
-@ClusteringClass("lnfi")
-public class LastNameFirstInitial extends AbstractClusteringFunction{
-
-    private boolean DEFAULT_AGGRESSIVE = true;
-
-    public LastNameFirstInitial(final Map<String, Integer> params) {
-        super(params);
-    }
-
-    @Override
-    public Collection<String> apply(Config conf, List<Field> fields) {
-        return fields.stream().filter(f -> !f.isEmpty())
-                .map(Field::stringValue)
-                .map(this::normalize)
-                .map(s -> doApply(conf, s))
-                .map(c -> filterBlacklisted(c, ngramBlacklist))
-                .flatMap(c -> c.stream())
-                .filter(StringUtils::isNotBlank)
-                .collect(Collectors.toCollection(HashSet::new));
-    }
-
-    @Override
-    protected String normalize(final String s) {
-        return fixAliases(transliterate(nfd(unicodeNormalization(s))))
-                // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
-                .replaceAll("[^ \\w]+", "")
-                .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
-                .replaceAll("(\\p{Punct})+", " ")
-                .replaceAll("(\\d)+", " ")
-                .replaceAll("(\\n)+", " ")
-                .trim();
-    }
-
-    @Override
-    protected Collection<String> doApply(final Config conf, final String s) {
-
-        final List<String> res = Lists.newArrayList();
-
-        final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
-
-        Person p = new Person(s, aggressive);
-
-        if (p.isAccurate()) {
-            String lastName = p.getNormalisedSurname().toLowerCase();
-            String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0,1);
-
-            res.add(firstInitial.concat(lastName));
-        }
-        else {  // is not accurate, meaning it has no defined name and surname
-            List<String> fullname = Arrays.asList(p.getNormalisedFullname().split(" "));
-            if (fullname.size() == 1) {
-                res.add(p.getNormalisedFullname().toLowerCase());
-            }
-            else if (fullname.size() == 2) {
-                res.add(fullname.get(0).substring(0,1).concat(fullname.get(1)).toLowerCase());
-                res.add(fullname.get(1).substring(0,1).concat(fullname.get(0)).toLowerCase());
-            }
-            else {
-                res.add(fullname.get(0).substring(0,1).concat(fullname.get(fullname.size()-1)).toLowerCase());
-                res.add(fullname.get(fullname.size()-1).substring(0,1).concat(fullname.get(0)).toLowerCase());
-            }
-        }
-
-        return res;
-    }
-}
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java
@ -9,7 +9,7 @@ import com.google.common.collect.Lists;
 import eu.dnetlib.pace.config.Config;
 import eu.dnetlib.pace.model.Person;

-@ClusteringClass("personHash")
+@ClusteringClass("personhash")
 public class PersonHash extends AbstractClusteringFunction {

 	private boolean DEFAULT_AGGRESSIVE = false;
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
@ -3,20 +3,24 @@ package eu.dnetlib.pace.common;
 import com.google.common.base.Joiner;
 import com.google.common.base.Splitter;
 import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
 import com.google.common.collect.Sets;
-import com.ibm.icu.text.Transliterator;
 import eu.dnetlib.pace.clustering.NGramUtils;
+import eu.dnetlib.pace.config.Type;
 import eu.dnetlib.pace.model.Field;
 import eu.dnetlib.pace.model.FieldList;
 import eu.dnetlib.pace.model.FieldListImpl;
+import eu.dnetlib.pace.model.FieldValueImpl;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;

 import java.io.IOException;
 import java.io.StringWriter;
+import java.io.UnsupportedEncodingException;
 import java.nio.charset.StandardCharsets;
 import java.text.Normalizer;
 import java.util.*;
+import java.util.function.Function;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
@ -32,7 +36,6 @@ public abstract class AbstractPaceFunctions {
    private static Map<String, String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");

    //list of stopwords in different languages
-    protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
    protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
    protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
    protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
@ -40,9 +43,6 @@ public abstract class AbstractPaceFunctions {
    protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
    protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");

-    //transliterator
-    protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
-
    //blacklist of ngrams: to avoid generic keys
    protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");

@ -68,13 +68,15 @@ public abstract class AbstractPaceFunctions {

    protected String cleanup(final String s) {

-        final String s1 = s.replaceAll(HTML_REGEX, "");
-        final String s2 = unicodeNormalization(s1.toLowerCase());
-        final String s3 = nfd(s2);
-        final String s4 = fixXML(s3);
-        final String s5 = s4.replaceAll("([0-9]+)", " $1 ");
-        final String s6 = transliterate(s5);
-        final String s7 = fixAliases(s6);
+        final String s00 = s.replaceAll(HTML_REGEX, "");
+        final String s0 = unicodeNormalization(s00.toLowerCase());
+        final String s1 = fixAliases(s0);
+        final String s2 = nfd(s1);
+        final String s3 = s2.replaceAll("&ndash;", " ");
+        final String s4 = s3.replaceAll("&amp;", " ");
+        final String s5 = s4.replaceAll("&quot;", " ");
+        final String s6 = s5.replaceAll("&minus;", " ");
+        final String s7 = s6.replaceAll("([0-9]+)", " $1 ");
        final String s8 = s7.replaceAll("[^\\p{ASCII}]", "");
        final String s9 = s8.replaceAll("[\\p{Punct}]", " ");
        final String s10 = s9.replaceAll("\\n", " ");
@ -83,14 +85,6 @@ public abstract class AbstractPaceFunctions {
        return s12;
    }

-    protected String fixXML(final String a){
-
-        return a.replaceAll("&ndash;", " ")
-                .replaceAll("&amp;", " ")
-                .replaceAll("&quot;", " ")
-                .replaceAll("&minus;", " ");
-    }
-
    protected boolean checkNumbers(final String a, final String b) {
        final String numbersA = getNumbers(a);
        final String numbersB = getNumbers(b);
@ -128,31 +122,19 @@ public abstract class AbstractPaceFunctions {

    protected static String fixAliases(final String s) {
        final StringBuilder sb = new StringBuilder();
-
-        s.chars().forEach(ch -> {
+        for (final char ch : Lists.charactersOf(s)) {
            final int i = StringUtils.indexOf(aliases_from, ch);
-            sb.append(i >= 0 ? aliases_to.charAt(i) : (char)ch);
-        });
-
+            sb.append(i >= 0 ? aliases_to.charAt(i) : ch);
+        }
        return sb.toString();
    }

-    protected static String transliterate(final String s) {
-        try {
-            return transliterator.transliterate(s);
-        }
-        catch(Exception e) {
-            return s;
-        }
-    }
-
    protected String removeSymbols(final String s) {
        final StringBuilder sb = new StringBuilder();

-        s.chars().forEach(ch -> {
-            sb.append(StringUtils.contains(alpha, ch) ? (char)ch : ' ');
-        });
-
+        for (final char ch : Lists.charactersOf(s)) {
+            sb.append(StringUtils.contains(alpha, ch) ? ch : " ");
+        }
        return sb.toString().replaceAll("\\s+", " ");
    }

@ -165,7 +147,7 @@ public abstract class AbstractPaceFunctions {
    }

    protected String normalize(final String s) {
-        return fixAliases(transliterate(nfd(unicodeNormalization(s))))
+        return nfd(unicodeNormalization(s))
                .toLowerCase()
                // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
                .replaceAll("[^ \\w]+", "")
@ -218,7 +200,6 @@ public abstract class AbstractPaceFunctions {
        s = filterStopWords(s, stopwords_fr);
        s = filterStopWords(s, stopwords_pt);
        s = filterStopWords(s, stopwords_es);
-        s = filterStopWords(s, stopwords_gr);

        return s;
    }
@ -234,13 +215,10 @@ public abstract class AbstractPaceFunctions {
    }

    public static Set<String> loadFromClasspath(final String classpath) {
-
-        Transliterator transliterator = Transliterator.getInstance("Any-Eng");
-
        final Set<String> h = Sets.newHashSet();
        try {
-            for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
-                h.add(fixAliases(transliterator.transliterate(s))); //transliteration of the stopwords
+            for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) {
+                h.add(s);
            }
        } catch (final Throwable e) {
            return Sets.newHashSet();
@ -249,17 +227,14 @@ public abstract class AbstractPaceFunctions {
    }

    public static Map<String, String> loadMapFromClasspath(final String classpath) {
-
-        Transliterator transliterator = Transliterator.getInstance("Any-Eng");
-
        final Map<String, String> m = new HashMap<>();
        try {
-            for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
+            for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) {
                //string is like this: code;word1;word2;word3
                String[] line = s.split(";");
                String value = line[0];
                for (int i = 1; i < line.length; i++) {
-                    m.put(fixAliases(transliterator.transliterate(line[i].toLowerCase())), value);
+                    m.put(line[i].toLowerCase(), value);
                }
            }
        } catch (final Throwable e) {
@ -347,7 +322,7 @@ public abstract class AbstractPaceFunctions {
    public static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
        final StringWriter sw = new StringWriter();
        try {
-            IOUtils.copy(clazz.getResourceAsStream(filename), sw, StandardCharsets.UTF_8);
+            IOUtils.copy(clazz.getResourceAsStream(filename), sw);
            return sw.toString();
        } catch (final IOException e) {
            throw new RuntimeException("cannot load resource from classpath: " + filename);
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java
@ -2,7 +2,6 @@ package eu.dnetlib.pace.config;

 import java.util.List;
 import java.util.Map;
-import java.util.regex.Pattern;

 import eu.dnetlib.pace.model.ClusteringDef;
 import eu.dnetlib.pace.model.FieldDef;
@ -48,7 +47,7 @@ public interface Config {
 	 *
 	 * @return the map
 	 */
-	public Map<String, List<Pattern>> blacklists();
+	public Map<String, List<String>> blacklists();


 	/**
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java
@ -1,6 +1,5 @@
 package eu.dnetlib.pace.config;

-import com.fasterxml.jackson.annotation.JsonIgnore;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.collect.Maps;
 import eu.dnetlib.pace.model.ClusteringDef;
@ -8,19 +7,15 @@ import eu.dnetlib.pace.model.FieldDef;
 import eu.dnetlib.pace.util.PaceException;
 import org.antlr.stringtemplate.StringTemplate;
 import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;

 import java.io.IOException;
 import java.io.Serializable;
-import java.nio.charset.StandardCharsets;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
-import java.util.regex.Pattern;
-import java.util.stream.Collectors;


 import eu.dnetlib.pace.tree.support.TreeNodeDef;
@ -36,9 +31,6 @@ public class DedupConfig implements Config, Serializable {

 	private WfConfig wf;

-	@JsonIgnore
-	private Map<String, List<Pattern>> blacklists;
-
 	private static Map<String, String> defaults = Maps.newHashMap();

 	static {
@ -65,12 +57,6 @@ public class DedupConfig implements Config, Serializable {
 			config = new ObjectMapper().readValue(json, DedupConfig.class);
 			config.getPace().initModel();
 			config.getPace().initTranslationMap();
-
-			config.blacklists = config.getPace().getBlacklists().entrySet()
-					.stream()
-					.collect(Collectors.toMap(e -> e.getKey(),
-							e ->e.getValue().stream().filter(s -> !StringUtils.isBlank(s)).map(Pattern::compile).collect(Collectors.toList()) ));
-
 			return config;
 		} catch (IOException e) {
 			throw new PaceException("Error in parsing configuration json", e);
@ -102,7 +88,7 @@ public class DedupConfig implements Config, Serializable {
 	}

 	private String readFromClasspath(final String resource) throws IOException {
-		return IOUtils.toString(getClass().getResource(resource), StandardCharsets.UTF_8);
+		return IOUtils.toString(getClass().getResource(resource));
 	}

 	public PaceConfig getPace() {
@ -151,8 +137,8 @@ public class DedupConfig implements Config, Serializable {
 	}

 	@Override
-	public Map<String, List<Pattern>> blacklists() {
-		return blacklists;
+	public Map<String, List<String>> blacklists() {
+		return getPace().getBlacklists();
 	}

 	@Override
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java
@ -3,7 +3,6 @@ package eu.dnetlib.pace.config;

 import com.fasterxml.jackson.annotation.JsonIgnore;
 import com.google.common.collect.Maps;
-import com.ibm.icu.text.Transliterator;
 import eu.dnetlib.pace.common.AbstractPaceFunctions;
 import eu.dnetlib.pace.model.ClusteringDef;
 import eu.dnetlib.pace.model.FieldDef;
@ -44,12 +43,10 @@ public class PaceConfig extends AbstractPaceFunctions implements Serializable {

 	public void initTranslationMap(){
 		translationMap = Maps.newHashMap();
-
-		Transliterator transliterator = Transliterator.getInstance("Any-Eng");
 		for (String key : synonyms.keySet()) {
 			for (String term : synonyms.get(key)){
 				translationMap.put(
-						fixAliases(transliterator.transliterate(term.toLowerCase())),
+						normalize(term.toLowerCase()),
 				key);
 			}
 		}
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java
@ -1,5 +1,5 @@
 package eu.dnetlib.pace.config;

 public enum Type {
-	String, Int, List, JSON, URL, StringConcat, DoubleArray
+	String, Int, List, JSON, URL
 }
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java
@ -20,6 +20,4 @@ public interface FieldValue extends Field {
 	 */
 	public void setValue(final Object value);

-	public double[] doubleArrayValue();
-
 }
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java
@ -58,10 +58,8 @@ public class FieldValueImpl extends AbstractField implements FieldValue {
 					throw new RuntimeException(value.toString());
 				}
 			case URL:
-				String str = value.toString();
-				return StringUtils.isBlank(str) || !isValidURL(str);
-			case DoubleArray:
-				return doubleArrayValue().length==0;
+			String str = value.toString();
+			return StringUtils.isBlank(str) || !isValidURL(str);
 		default:
 			return true;
 		}
@ -118,10 +116,6 @@ public class FieldValueImpl extends AbstractField implements FieldValue {
 		// }
 	}

-	public double[] doubleArrayValue() {
-		return (double[])getValue();
-	}
-
 	/*
 	 * (non-Javadoc)
 	 * 
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java
@ -43,7 +43,7 @@ public class Person {
 			// s = s.replaceAll("[\\W&&[^,-]]", "");
 		}

-		if (s.contains(",")) {	//if the name contains a comma it is easy derivable the name and the surname
+		if (s.contains(",")) {
 			final String[] arr = s.split(",");
 			if (arr.length == 1) {
 				fullname = splitTerms(arr[0]);
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
@ -25,8 +25,6 @@ public class AuthorsMatch extends AbstractComparator {
    private double NAME_THRESHOLD;
    private double FULLNAME_THRESHOLD;
    private String MODE; //full or surname
-    private int SIZE_THRESHOLD;
-    private String TYPE; //count or percentage
    private int common;

    public AuthorsMatch(Map<String, String> params){
@ -37,8 +35,6 @@ public class AuthorsMatch extends AbstractComparator {
        SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95"));
        NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95"));
        FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
-        SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20"));
-        TYPE = params.getOrDefault("type", "percentage");
        common = 0;
    }

@ -52,9 +48,6 @@ public class AuthorsMatch extends AbstractComparator {
        if (a.isEmpty() || b.isEmpty())
            return -1;

-        if (((FieldList) a).size() > SIZE_THRESHOLD || ((FieldList) b).size() > SIZE_THRESHOLD)
-            return 1.0;
-
        List<Person> aList = ((FieldList) a).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList());
        List<Person> bList = ((FieldList) b).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList());

@ -67,10 +60,7 @@ public class AuthorsMatch extends AbstractComparator {
                //both persons are inaccurate
                if (!p1.isAccurate() && !p2.isAccurate()) {
                    //compare just normalized fullnames
-                    String fullname1 = normalization(p1.getNormalisedFullname().isEmpty()? p1.getOriginal() : p1.getNormalisedFullname());
-                    String fullname2 = normalization(p2.getNormalisedFullname().isEmpty()? p2.getOriginal() : p2.getNormalisedFullname());
-
-                    if (ssalgo.score(fullname1, fullname2) > FULLNAME_THRESHOLD) {
+                    if (ssalgo.score(normalization(p1.getNormalisedFullname()), normalization(p2.getNormalisedFullname())) > FULLNAME_THRESHOLD) {
                        common += 1;
                        break;
                    }
@ -79,14 +69,10 @@ public class AuthorsMatch extends AbstractComparator {
                //one person is inaccurate
                if (p1.isAccurate() ^ p2.isAccurate()) {
                    //prepare data
-                    //data for the accurate person
-                    String name = normalization(p1.isAccurate()? p1.getNormalisedFirstName() : p2.getNormalisedFirstName());
-                    String surname = normalization(p1.isAccurate()? p1.getNormalisedSurname() : p2.getNormalisedSurname());
+                    String name = p1.isAccurate()? normalization(p1.getNormalisedFirstName()) : normalization(p2.getNormalisedFirstName());
+                    String surname = p1.isAccurate()? normalization(p2.getNormalisedSurname()) : normalization(p2.getNormalisedSurname());

-                    //data for the inaccurate person
-                    String fullname = normalization(
-                            p1.isAccurate() ? ((p2.getNormalisedFullname().isEmpty()) ? p2.getOriginal() : p2.getNormalisedFullname()) : (p1.getNormalisedFullname().isEmpty() ? p1.getOriginal() : p1.getNormalisedFullname())
-                    );
+                    String fullname = p1.isAccurate()? normalization(p2.getNormalisedFullname()) : normalization(p1.getNormalisedFullname());

                    if (fullname.contains(surname)) {
                        if (MODE.equals("full")) {
@ -125,12 +111,7 @@ public class AuthorsMatch extends AbstractComparator {
        //normalization factor to compute the score
        int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common);

-        if(TYPE.equals("percentage")) {
-            return (double) common / normFactor;
-        }
-        else {
-            return (double) common;
-        }
+        return (double)common / normFactor;
    }

    public boolean compareSurname(Person p1, Person p2) {
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java
@ -1,53 +0,0 @@
-package eu.dnetlib.pace.tree;
-
-import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.FieldList;
-import eu.dnetlib.pace.model.FieldValueImpl;
-import eu.dnetlib.pace.model.Person;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
-import eu.dnetlib.pace.tree.support.ComparatorClass;
-
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.stream.Collectors;
-
-@ComparatorClass("cosineSimilarity")
-public class CosineSimilarity extends AbstractComparator {
-
-    Map<String, String> params;
-
-    public CosineSimilarity(Map<String,String> params) {
-        super(params);
-    }
-
-    @Override
-    public double compare(final Field a, final Field b, final Config conf) {
-
-        if (a.isEmpty() || b.isEmpty())
-            return -1;
-
-        double[] aVector = ((FieldValueImpl) a).doubleArrayValue();
-        double[] bVector = ((FieldValueImpl) b).doubleArrayValue();
-
-        return cosineSimilarity(aVector, bVector);
-    }
-
-    double cosineSimilarity(double[] a, double[] b) {
-        double dotProduct = 0;
-        double normASum = 0;
-        double normBSum = 0;
-
-        for(int i = 0; i < a.length; i ++) {
-            dotProduct += a[i] * b[i];
-            normASum += a[i] * a[i];
-            normBSum += b[i] * b[i];
-        }
-
-        double eucledianDist = Math.sqrt(normASum) * Math.sqrt(normBSum);
-        return dotProduct / eucledianDist;
-    }
-
-
-}
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java
@ -16,7 +16,6 @@ public class DomainExactMatch extends ExactMatchIgnoreCase {

    @Override
    protected String getValue(final Field f) {
-
        try {
            return asUrl(super.getValue(f)).getHost();
        } catch (MalformedURLException e) {
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java
@ -50,9 +50,6 @@ public class JaroWinklerNormalizedName extends AbstractComparator {
        cb = removeKeywords(cb, keywords2);
        cb = removeKeywords(cb, cities2);

-        ca = ca.replaceAll("[ ]{2,}", " ");
-        cb = cb.replaceAll("[ ]{2,}", " ");
-
        if (ca.isEmpty() && cb.isEmpty())
            return 1.0;
        else
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java
@ -1,34 +0,0 @@
-package eu.dnetlib.pace.tree;
-
-import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
-import eu.dnetlib.pace.tree.support.ComparatorClass;
-
-import java.util.Map;
-
-@ComparatorClass("numbersComparator")
-public class NumbersComparator extends AbstractComparator {
-
-    Map<String, String> params;
-
-    public NumbersComparator(Map<String, String> params) {
-        super(params);
-        this.params = params;
-    }
-
-    @Override
-    public double distance(String a, String b, Config conf) {
-
-        //extracts numbers from the field
-        String numbers1 = getNumbers(nfd(a));
-        String numbers2 = getNumbers(nfd(b));
-
-        if (numbers1.isEmpty() || numbers2.isEmpty())
-            return -1.0;
-
-        int n1 = Integer.parseInt(numbers1);
-        int n2 = Integer.parseInt(numbers2);
-
-        return Math.abs(n1 - n2);
-    }
-}
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java
@ -42,25 +42,22 @@ public class StringContainsMatch extends AbstractComparator {
            STRING = STRING.toLowerCase();
        }

-        if (AGGREGATOR != null) {
-            switch (AGGREGATOR) {
-                case "AND":
-                    if (ca.contains(STRING) && cb.contains(STRING))
-                        return 1.0;
-                    break;
-                case "OR":
-                    if (ca.contains(STRING) || cb.contains(STRING))
-                        return 1.0;
-                    break;
-                case "XOR":
-                    if (ca.contains(STRING) ^ cb.contains(STRING))
-                        return 1.0;
-                    break;
-                default:
-                    return 0.0;
-            }
+        switch(AGGREGATOR) {
+            case "AND":
+                if(ca.contains(STRING) && cb.contains(STRING))
+                    return 1.0;
+                break;
+            case "OR":
+                if(ca.contains(STRING) || cb.contains(STRING))
+                    return 1.0;
+                break;
+            case "XOR":
+                if(ca.contains(STRING) ^ cb.contains(STRING))
+                    return 1.0;
+                break;
+            default:
+                return 0.0;
        }
-
        return 0.0;
    }
 }
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java
@ -19,13 +19,9 @@ public class StringListMatch extends AbstractComparator {
    private static final Log log = LogFactory.getLog(StringListMatch.class);
    private Map<String, String> params;

-    final private String TYPE; //percentage or count
-
    public StringListMatch(final Map<String, String> params) {
        super(params);
        this.params = params;
-
-        TYPE = params.getOrDefault("type", "percentage");
    }

    @Override
@ -35,7 +31,7 @@ public class StringListMatch extends AbstractComparator {
        final Set<String> pb = new HashSet<>(((FieldList) b).stringList());

        if (pa.isEmpty() || pb.isEmpty()) {
-            return -1;  //return undefined if one of the two lists is empty
+            return -1;  //return undefined if one of the two lists of pids is empty
        }

        int incommon = Sets.intersection(pa, pb).size();
@ -45,10 +41,7 @@ public class StringListMatch extends AbstractComparator {
            return 0.0;
        }

-        if(TYPE.equals("percentage"))
-            return (double)incommon / (incommon + simDiff);
-        else
-            return incommon;
+        return (double)incommon / (incommon + simDiff);

    }
 }
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java
@ -1,6 +1,5 @@
 package eu.dnetlib.pace.tree.support;

-import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.pace.config.Config;
 import eu.dnetlib.pace.config.PaceConfig;
@ -10,7 +9,6 @@ import org.apache.commons.lang3.StringUtils;

 import java.io.IOException;
 import java.io.Serializable;
-import java.io.StringWriter;
 import java.util.List;

 public class TreeNodeDef implements Serializable {
@ -59,9 +57,8 @@ public class TreeNodeDef implements Serializable {
                double result2 = comparator(fieldConf).compare(doc1.getFieldMap().get(crossField), doc2.getFieldMap().get(fieldConf.getField()), conf);
                result = Math.max(result1,result2);
            }
-            else {
+            else
                result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
-            }

            stats.addFieldStats(
                    fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java
@ -161,14 +161,11 @@ public class BlockProcessorForTesting {
                            }
                            else {
                                //use the decision tree implementation or the "normal" implementation of the similarity score (valid only for publications)
-                                if (useTree)
+                                if(useTree)
                                    emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
                                else
                                    emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
                            }
-//                            if(new TreeProcessor(dedupConf).compare(pivot, curr) != publicationCompare(pivot, curr, dedupConf)) {
-//                                emitOutput(true, idPivot, idCurr, context);
-//                            }

                        }
                    }
@ -183,45 +180,38 @@ public class BlockProcessorForTesting {
        return compare>=1.0;
    }

-    private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
-        //if the score gives 1, the publications are equivalent
-        Map<String, String> params = new HashMap<>();
-        params.put("jpath_value", "$.value");
-        params.put("jpath_classid", "$.qualifier.classid");
-        params.put("mode", "count");
+        private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {

-        double score = 0.0;
+            double score = 0.0;
+            //LAYER 1 - comparison of the PIDs json lists
+            Map<String, String> params = new HashMap<>();
+            params.put("jpath_value", "$.value");
+            params.put("jpath_classid", "$.qualifier.classid");
+            JsonListMatch jsonListMatch = new JsonListMatch(params);
+            double result = jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config);
+            if (result >= 0.5) //if the result of the comparison is greater than the threshold
+                score += 10.0;  //high score because it should match when the first condition is satisfied
+            else
+                score += 0.0;

-        //levenstein title
-        LevensteinTitle levensteinTitle = new LevensteinTitle(params);
-        if(levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config) >= 0.9) {
-            score += 0.2;
+            //LAYER 2 - comparison of the title version and the size of the authors lists
+            TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
+            double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
+            SizeMatch sizeMatch = new SizeMatch(params);
+            double result2 = sizeMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
+            if (Math.min(result1, result2) != 0)
+                score+=0;
+            else
+                score-=2;
+
+            //LAYER 3 - computation of levenshtein on titles
+            LevensteinTitle levensteinTitle = new LevensteinTitle(params);
+            double result3 = levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
+            score += Double.isNaN(result3)?0.0:result3;
+
+            return score >= 0.99;
        }

-        //pid
-        JsonListMatch jsonListMatch = new JsonListMatch(params);
-        if (jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config) >= 1.0) {
-            score += 0.5;
-        }
-
-        //title version
-        TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
-        double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
-        if(result1<0 || result1>=1.0) {
-            score += 0.1;
-        }
-
-        //authors match
-        params.remove("mode");
-        AuthorsMatch authorsMatch = new AuthorsMatch(params);
-        double result2 = authorsMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
-        if(result2 <0|| result2>=0.6) {
-            score += 0.2;
-        }
-
-        return score>=0.5;
-    }
-
        private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context)  {

            if (result) {
@ -244,5 +234,6 @@ public class BlockProcessorForTesting {
            final String type = dedupConf.getWf().getEntityType();

            context.emit(type, from, to);
+            context.emit(type, to, from);
        }
 }
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java
@ -7,13 +7,14 @@ import com.jayway.jsonpath.JsonPath;
 import com.jayway.jsonpath.Option;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.config.Type;
-import eu.dnetlib.pace.model.*;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldListImpl;
+import eu.dnetlib.pace.model.FieldValueImpl;
+import eu.dnetlib.pace.model.MapDocument;
 import net.minidev.json.JSONArray;

-import java.math.BigDecimal;
 import java.util.*;
 import java.util.function.Predicate;
-import java.util.stream.Collectors;

 public class MapDocumentUtil {

@ -44,25 +45,6 @@ public class MapDocumentUtil {
                            .forEach(fi::add);
                    stringField.put(fdef.getName(), fi);
                    break;
-                case DoubleArray:
-                    stringField.put(
-                            fdef.getName(),
-                            new FieldValueImpl(Type.DoubleArray,
-                                    fdef.getName(),
-                                    getJPathArray(fdef.getPath(), json))
-                    );
-                    break;
-                case StringConcat:
-                    String[] jpaths = fdef.getPath().split("\\|\\|\\|");
-                    stringField.put(
-                            fdef.getName(),
-                            new FieldValueImpl(Type.String,
-                                    fdef.getName(),
-                                    truncateValue(Arrays.stream(jpaths).map(jpath -> getJPathString(jpath, json)).collect(Collectors.joining(" ")),
-                                            fdef.getLength())
-                            )
-                    );
-                    break;
            }
        });
        m.setFieldMap(stringField);
@ -121,30 +103,6 @@ public class MapDocumentUtil {
        }
    }

-    public static double[] getJPathArray(final String jsonPath, final String json) {
-        try {
-            Object o = JsonPath.read(json, jsonPath);
-            if (o instanceof double[])
-                return (double[]) o;
-            if (o instanceof JSONArray) {
-                Object[] objects = ((JSONArray) o).toArray();
-                double[] array = new double[objects.length];
-                for (int i = 0; i < objects.length; i++) {
-                    if (objects[i] instanceof BigDecimal)
-                        array[i] = ((BigDecimal)objects[i]).doubleValue();
-                    else
-                        array[i] = (double) objects[i];
-                }
-                return array;
-            }
-            return new double[0];
-        }
-        catch (Exception e) {
-            e.printStackTrace();
-            return new double[0];
-        }
-    }
-

    public static String truncateValue(String value, int length) {
        if (value == null)
--- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv
+++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv
@ -909,6 +909,7 @@ city::2389086;Berberati;BBT;Berberati;Berbérati;Берберати;
 city::2389853;Bangui;BGF;Bangi;Bangis;Bangui;Mpan'nkoui;ban ji;bang-gi;bangi;bangwyy;Μπανγκουί;Банги;Бангі;בנגואי;بانگوئی;ባንጊ;バンギ;班基;방기;
 city::2255414;Pointe-Noire;PNR;Pointe-Noire;Puehnt-Nuar;Puent Nuaras;puaengteunualeu;Пуэнт-Нуар;푸앵트누아르;
 city::2258261;Dolisie;DIS;Dolisi;Dolisie;Dolisje;Dolizi;Dolosie;Loubomo;Lubomo;dolliji;dorishi;duo li xi;dwlysy;Долиси;Лубомо;دولیسی;ドリシー;多利西;돌리지;
+city::2259383;Kayes;Jacob;Kai;Kajes;Kaye;Kayes;Kaï;Кайес;
 city::2260535;Brazzaville;BZV;Braza;Brazavil;Brazavilis;Brazavilo;Brazzavil';Brazzaville;Maya-Maya;Mprazabil;N'Tamo;beulajabil;brazafyl;brazawyl;brzwwyl;bu la chai wei er;burazavu~iru;Μπραζαβίλ;Браззавиль;ברזוויל;برازافيل;برازاویل;ብራዛቪል;ブラザヴィル;布拉柴维尔;브라자빌;
 city::2657896;Zurich;Cirihe;Cirikh;Ciurichas;Cjurikh;Cjurikh khot;Cuerih;Curych;Cürih;Cīrihe;Gorad Cjurykh;Lungsod ng Zuerich;Lungsod ng Zürich;Su-la-sie;Suerix;Syurix;Sürix;Sŭ-là̤-sié;Tsuerix;Tsurique;Tsürix;Turicum;Turitg;ZRH;Zeurich;Zirich;Zirik;Zuerich;Zuerigh;Zuerih;Zuric;Zurich;Zuricu;Zurigh;Zurigo;Zuriko;Zurique;Zurych;Zurìcu;Zyriche;Zyrihu;Zúric;Zúrich;Zürich;Zürigh;Zürih;churihhi;chwilihi;curikku;jhyurika;jurikha;su li shi;su rik;suricc;tsiurikhi;tsyryk;zi'urikha;zwrykh;zyryk;zyurikha;zywrch;zywrh;zywrkh;Ζυρίχη;Горад Цюрых;Цирих;Цюрих;Цюрих хот;Ցյուրիխ;ציריך;زوريخ;زوریخ;زيورخ;زیورخ;زیورچ;سيۇرىخ;څوریخ;ܙܝܘܪܚ;ܬܣܝܪܝܟ;ज़्यूरिख़;झ्युरिक;জুরিখ;ਜ਼ਿਊਰਿਖ;சூரிக்கு;സൂറിച്ച്;ซูริก;ဇူးရစ်ချ်မြို့;ციურიხი;ዙሪክ;チューリッヒ;苏黎世;蘇黎世;취리히;
 city::2657970;Winterthur;Eulachstadt;Gorad Vintehrtur;Vintertour;Vintertur;Vintertura;Vinterturas;Vinterturi;Vinterturo;Vintertūra;Vintertūras;Vitudurum;Winterthour;Winterthur;ZLI;binteotueo;fyntrtwr;vu~intato~uru;wen te tu er;wntrtwr;Βίντερτουρ;Винтертур;Вінтертур;Горад Вінтэртур;فينترتور;ونترتور;ونٹرتھر;ვინტერთური;ヴィンタートゥール;温特图尔;빈터투어;
@ -2993,7 +2994,7 @@ city::262036;Glyfada;Aixone;Glifadha;Glifádha;Glyfada;Glyfáda;Γλυφάδα;
 city::262135;Galatsi;Galatsi;Galatsion;Galátsi;Galátsion;Γαλάτσι;Γαλάτσιον;
 city::263986;Agios Dimitrios;Agios Dimitrios;Ayios Dhimitrios;Brakhami;Brakhámi;Áyios Dhimítrios;Άγιος Δημήτριος;
 city::264194;Agia Paraskevi;Agia Paraskeue;Agia Paraskevi;Agía Paraskeví;Ayia Paraskevi;Ayía Paraskeví;Αγία Παρασκευή;
-city::264371;Athens;athenon;ATH;Afina;Afini;Afiny;An Aithin;Ateena;Atehny;Aten;Atena;Atenai;Atenas;Atenas - Athena;Atenas - Αθήνα;Αθηνών;Atene;Atenes;Ateni;Ateno;Atenoj;Ateny;Athen;Athena;Athenae;Athenai;Athene;Athenes;Athens;Atheny;Athina;Athinai;Athinia;Athènes;Athén;Athénes;Athény;Athína;Athínai;Atina;Atény;Atēnas;Atėnai;Aþena;Kota Athena;Lungsod ng Athina;Lungsod ng Athína;atene;atene si;ateni;athensa;athyna;atn;etens;xethens;ya dian;Αθήνα;Αθήναι;Αθηνα;Αθηναι;Атина;Атэны;Афины;Афіни;Аѳины;Աթենք;אתונה;آتن;أثينا;ئافېنا;ܐܬܝܢܐ;अथेन्स;ஏதென்ஸ்;เอเธนส์;ათენი;Ἀθῆναι;アテネ;雅典;아테네;아테네 시;
+city::264371;Athens;ATH;Afina;Afini;Afiny;An Aithin;Ateena;Atehny;Aten;Atena;Atenai;Atenas;Atenas - Athena;Atenas - Αθήνα;Atene;Atenes;Ateni;Ateno;Atenoj;Ateny;Athen;Athena;Athenae;Athenai;Athene;Athenes;Athens;Atheny;Athina;Athinai;Athinia;Athènes;Athén;Athénes;Athény;Athína;Athínai;Atina;Atény;Atēnas;Atėnai;Aþena;Kota Athena;Lungsod ng Athina;Lungsod ng Athína;atene;atene si;ateni;athensa;athyna;atn;etens;xethens;ya dian;Αθήνα;Αθήναι;Αθηνα;Αθηναι;Атина;Атэны;Афины;Афіни;Аѳины;Աթենք;אתונה;آتن;أثينا;ئافېنا;ܐܬܝܢܐ;अथेन्स;ஏதென்ஸ்;เอเธนส์;ათენი;Ἀθῆναι;アテネ;雅典;아테네;아테네 시;
 city::265243;Marousi;Amarousio;Amarousion;Amaroúsion;Marousi;Maroussi;Maroúsi;Αμαρούσιον;Μαρούσι;
 city::265488;Acharnes;Acharnae;Acharnai;Acharne;Acharnes;Akharnai;Akharnaí;Menidhi;Menidhion;Menidi;Menidion;Menioi;Menídhi;Menídhion;Meníoi;Αχαρνές;Αχαρναί;Μενίδι;Μενίδιον;
 city::265533;Aigaleo;Aegaleo;Aigaleo;Aigáleo;Egaleo;Αιγάλεω;
@ -5209,6 +5210,7 @@ city::2451478;Segou;Segi;Segou;Segu;Segú;Senkou;Ségou;sai gu;segu;sgw;syghw;Σ
 city::2453348;Mopti;MZI;Mopti;Moptis;mo pu ti;mobti;moputi;mwbty;mwpty;Μοπτί;Мопти;Мопті;موبتي;موپتی;موپٹی;モプティ;莫普提;몹티;
 city::2453662;Markala;Markala;
 city::2454268;Koutiala;KTX;Koutiala;Kutiala;ku jia la;Кутиала;庫佳拉;
+city::2455518;Kayes;Gorad Kaes;KYS;Kaes;Kagies;Kajes;Kajesas;Kayes;Kayi;ka yi;kai;kays;keseu;kyz;Καγιές;Горад Каес;Каес;Кайес;Каєс;كايس;کایس;کیز;კაესი;カイ;卡伊;케스;
 city::2457163;Gao;GAQ;Gao;Nkao;gao;gaw;gayw;jaw;jia ao;ka xo;Γκάο;Гао;Ґао;גאו;جاو;گائو;گاو;กาโอ;ガオ;加奥;가오;
 city::2460596;Bamako;BKO;Bamaco - Bamako;Bamakas;Bamako;Bamaku;Bamakó;Bamakɔ;Bammaco;Bammako;Mpamako;ba ma ke;bamako;bamakw;bmqw;Μπαμάκο;Бамако;Բամակո;במקו;باماكو;باماکو;ባማኮ;バマコ;巴馬科;바마코;
 city::1285173;Yenangyaung;Yaynangyoung;Yenangyaung;Yenangyoung;
@ -7472,6 +7474,7 @@ city::4177887;West Palm Beach;Litus Palmense Occidentale;Okcidenta Palm Beach;PB
 city::4178003;Weston;Uehston;Veston;Weston;vestana;wei si dun;wstwn;wstwn  flwryda;Вестон;Уэстон;وستون;وستون، فلوریدا;वेस्टन;韦斯顿;
 city::4179320;Albany;ABY;Albany;City of Opportunity;Olbani;albani;albany;albany  jarjya;albany  jwrjya;ao er ba ni;olbeoni;orubani;Олбани;Олбані;آلبانی، جورجیا;ألباني;البانی، جارجیا;अल्बानी;オールバニ;奧爾巴尼;올버니;
 city::4179574;Alpharetta;Al'faretta;Alfareta;Alpharetta;New Prospect Campground;alfarta  jwrjya;alfaryta;alfaryta  jarjya;alphareta;Алфарета;Альфаретта;آلفارتا، جورجیا;ألفاريتا;الفاریتا، جارجیا;अल्फारेटा;
+city::4180386;Athens;AHN;Atens;Atensas;Athens;Athens i Georgia;Athens-Clarke County;Atina;Atuns;Cedar Shoals;aeseonseu;asenzu;athensa;athyna;atn  jwrjya;atynz  jwrjya;ethensaklarka ka'unti;ya dian;Атенс;Атина;Атънс;אתנס;آتئنز، جورجیا;آتن، جورجیا;أثينا;ایتھنز، جارجیا;अथेन्स;एथेन्सक्लार्क काउन्टी;アセンズ;雅典;애선스;
 city::4180439;Atlanta;ATL;Atlant;Atlanta;Atlantae;Atlonta;Canebrake;Gorad Atlanta;Marthasdale;Marthasville;Standing Peachtree;Terminus;White Hall;Whitehall;aeteullaenta;arr‌lanra nagaram;atalanta;ateullaenta;ateullanta;atlanta;atoranta;atʼlantʼa;etalanta;etlanta;ya te lan da;Ατλάντα;Атлантæ;Атланта;Горад Атланта;Ատլանտա;אטלאנטא;אטלנטה;آتلانتا;أتلانتا;ئەتڵانتا;اٹلانٹا;اٹلانٹا، جارجیا;अटलांटा;अटलान्टा;एट्लान्टा;एत्लान्ता;আটলান্টা;એટલાન્ટા;அட்லான்டா;అట్లాంటా;ಅಟ್ಲಾಂಟಾ;അറ്റ്‌ലാന്റാ നഗരം;แอตแลนตา;ཨ་ཊི་ལཱན་ཊཱ།;အတ္တလန္တာမြို့;ატლანტა;አትላንታ;アトランタ;亚特兰大;亞特蘭大;아틀란타;아틀랜타;애틀랜타;
 city::4184530;Brookhaven;Brookhaven;Brookhaven Heights;Nort Atlanta;North Atlanta;brwk hawn  jwrjya;brwkhafn;nartha etlanta;Норт Атланта;بروكهافن;بروک هاون، جورجیا;بروک ہیون، جارجیا;नर्थ एट्लान्टा;
 city::4188985;Columbus;CSG;Columbus;Kolambus;Kolumbas;Kolumbus;Kulumbus;ge lun bu;klmbws  jwrjya;kolambasa;kolleombeoseu;kolumbus;koronbasu;kwlmbs  jarjya;kwlwmbws;qwlwmbws;Коламбус;Колумбус;Кълъмбъс;קולומבוס;كولومبوس;کلمبوس، جورجیا;کولمبس، جارجیا;कोलम्बस;コロンバス;哥伦布;콜럼버스;
@ -7950,7 +7953,7 @@ city::5258957;La Crosse;Gateway City;LSE;La Crosse;La Kros;La-Kross;Lac Rosse;La
 city::5261457;Madison;Gorad Madysan;MSN;Madison;Madisonas;Madisonia;Madisons;Madisun;Mantison;Medison;Medisona;Mehdison;madisan;madison;madisoni;madyswn;maediseun;mai di xun;maidisana;mdysn  wyskansyn;mdyswn;medisana;metican;Μάντισον;Горад Мадысан;Мадисон;Мадисън;Медисон;Медісон;Мэдисон;Մեդիսոն;מדיסון;ماديسون;مدیسن، ویسکانسین;میڈیسن;میڈیسون، وسکونسن;माडिसन्;मॅडिसन;मेडिसन;मैडिसन;மேடிசன்;მადისონი;マディソン;麦迪逊;매디슨;
 city::5263045;Milwaukee;Gorad Miluoki;Juneautown;Kilbourntown;MKE;Mahn-a-wau-kee Seepe;Mahn-a-wauk-ee See-pe;Mahn-a-waukee Seepe;Mahn-a-waukie;Mahn-ah-wauk Seepe;Mahnawauk;Man-a-wau-kee;Man-a-wauk-ee;Man-na-wah-kie;Mana'wa;Manawaki;Manawaukee;Manayaukee;Maunahwauke;Mee-lee-waug-ee;Meliki;Melleoki;Melwarik;Meneawkee;Meolaki;Mil-wah-kie;Milgouoki;Milioke;Millewacki;Millicki;Milo-aki;Milouagui;Milouakik;Milowages;Miluoki;Miluokʻi;Milvauchia;Milvoki;Milvokio;Milvokis;Milwacky;Milwahkie;Milwalka;Milwalky;Milwarck;Milwarik;Milwaucki;Milwaukee;Milwaukie;Minewaki;Miniaki;Minnawack;Winnipesaukee;mi er wo ji;mil wxki;mil-woki;mila'oyaki;milavoki;miluokʼi;milvaki;milvakki;milvauki;miruu~oki;mlwaky;mylwaky;mylwaky  wyskansyn;mylwwqy;Μιλγουόκι;Горад Мілуокі;Милвоки;Милуоки;Мілуокі;Միլուոքի;מילוואקי;מילווקי;ملواکی;ميلواكي;میلواکی، ویسکانسین;मिलवॉकी;मिल्वौकी;মিলওয়াকি;மில்வாக்கி;మిల్వాకీ;ಮಿಲ್ವಾಕೀ;มิลวอกี;მილუოკი;ミルウォーキー;密尔沃基;密爾沃基;밀워키;
 city::5264870;North La Crosse;;
-city::5265838;Oshkosh;Algoma;Brooklyn;OSH;Oshkosh;Oskosh;Sawdust City;ashkwsh  wyskansyn;ausakosa;awshkwsh;awshkwsh  wskwnsn;oshukoshu;Ошкош;אושקוש;أوشكوش;اشکوش، ویسکانسین;اوشکوش، وسکونسن;औशकोश;ოშკოში;オシュコシュ;
+city::5265838;Oshkosh;Algoma;Athens;Brooklyn;OSH;Oshkosh;Oskosh;Sawdust City;ashkwsh  wyskansyn;ausakosa;awshkwsh;awshkwsh  wskwnsn;oshukoshu;Ошкош;אושקוש;أوشكوش;اشکوش، ویسکانسین;اوشکوش، وسکونسن;औशकोश;ოშკოში;オシュコシュ;
 city::5268249;Racine;Kipikawi;Port Gilbert;RAC;Racine;Rasijn;Rasin;Rasinas;la xin;leosin;rashin;rasini;rasyn;rasyn  wskwnsn;rysyn  wyskansyn;Расийн;Расин;Расін;ראסין;راسين;راسین، وسکونسن;ریسین، ویسکانسین;რასინი;ラシーン;拉辛;러신;
 city::5278052;Waukesha;Prairieville;UES;Uokesho;Uokisha;Vokesha;Vokisha;Waukesha;u~okisho;wakysha  wyskansyn;wawkysha  wskwnsn;wkysha;Вокеша;Вокиша;Уокешо;Уокиша;واوکیشا، وسکونسن;واکیشا، ویسکانسین;وكيشا;უოკეშო;ウォキショー;
 city::5278420;West Allis;Vest Alis;alys ghrby  wyskansyn;wyst alys;Вест Алис;آلیس غربی، ویسکانسین;ويست أليس;ویسٹ الیس، وسکونسن;უესტ-ალისი;
--- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_gr.txt
+++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_gr.txt
@ -1,847 +0,0 @@
-ένα
-έναν
-ένας
-αι
-ακομα
-ακομη
-ακριβως
-αληθεια
-αληθινα
-αλλα
-αλλαχου
-αλλες
-αλλη
-αλλην
-αλλης
-αλλιως
-αλλιωτικα
-αλλο
-αλλοι
-αλλοιως
-αλλοιωτικα
-αλλον
-αλλος
-αλλοτε
-αλλου
-αλλους
-αλλων
-αμα
-αμεσα
-αμεσως
-αν
-ανα
-αναμεσα
-αναμεταξυ
-ανευ
-αντι
-αντιπερα
-αντις
-ανω
-ανωτερω
-αξαφνα
-απ
-απεναντι
-απο
-αποψε
-από
-αρα
-αραγε
-αργα
-αργοτερο
-αριστερα
-αρκετα
-αρχικα
-ας
-αυριο
-αυτα
-αυτες
-αυτεσ
-αυτη
-αυτην
-αυτης
-αυτο
-αυτοι
-αυτον
-αυτος
-αυτοσ
-αυτου
-αυτους
-αυτουσ
-αυτων
-αφοτου
-αφου
-αἱ
-αἳ
-αἵ
-αὐτόσ
-αὐτὸς
-αὖ
-α∆ιακοπα
-βεβαια
-βεβαιοτατα
-γάρ
-γα
-γα^
-γε
-γι
-για
-γοῦν
-γρηγορα
-γυρω
-γὰρ
-δ'
-δέ
-δή
-δαί
-δαίσ
-δαὶ
-δαὶς
-δε
-δεν
-δι
-δι'
-διά
-δια
-διὰ
-δὲ
-δὴ
-δ’
-εαν
-εαυτο
-εαυτον
-εαυτου
-εαυτους
-εαυτων
-εγκαιρα
-εγκαιρως
-εγω
-ειθε
-ειμαι
-ειμαστε
-ειναι
-εις
-εισαι
-εισαστε
-ειστε
-ειτε
-ειχα
-ειχαμε
-ειχαν
-ειχατε
-ειχε
-ειχες
-ει∆εμη
-εκ
-εκαστα
-εκαστες
-εκαστη
-εκαστην
-εκαστης
-εκαστο
-εκαστοι
-εκαστον
-εκαστος
-εκαστου
-εκαστους
-εκαστων
-εκει
-εκεινα
-εκεινες
-εκεινεσ
-εκεινη
-εκεινην
-εκεινης
-εκεινο
-εκεινοι
-εκεινον
-εκεινος
-εκεινοσ
-εκεινου
-εκεινους
-εκεινουσ
-εκεινων
-εκτος
-εμας
-εμεις
-εμενα
-εμπρος
-εν
-ενα
-εναν
-ενας
-ενος
-εντελως
-εντος
-εντωμεταξυ
-ενω
-ενός
-εξ
-εξαφνα
-εξης
-εξισου
-εξω
-επ
-επί
-επανω
-επειτα
-επει∆η
-επι
-επισης
-επομενως
-εσας
-εσεις
-εσενα
-εστω
-εσυ
-ετερα
-ετεραι
-ετερας
-ετερες
-ετερη
-ετερης
-ετερο
-ετεροι
-ετερον
-ετερος
-ετερου
-ετερους
-ετερων
-ετουτα
-ετουτες
-ετουτη
-ετουτην
-ετουτης
-ετουτο
-ετουτοι
-ετουτον
-ετουτος
-ετουτου
-ετουτους
-ετουτων
-ετσι
-ευγε
-ευθυς
-ευτυχως
-εφεξης
-εχει
-εχεις
-εχετε
-εχθες
-εχομε
-εχουμε
-εχουν
-εχτες
-εχω
-εως
-εἰ
-εἰμί
-εἰμὶ
-εἰς
-εἰσ
-εἴ
-εἴμι
-εἴτε
-ε∆ω
-η
-ημασταν
-ημαστε
-ημουν
-ησασταν
-ησαστε
-ησουν
-ηταν
-ητανε
-ητοι
-ηττον
-η∆η
-θα
-ι
-ιι
-ιιι
-ισαμε
-ισια
-ισως
-ισωσ
-ι∆ια
-ι∆ιαν
-ι∆ιας
-ι∆ιες
-ι∆ιο
-ι∆ιοι
-ι∆ιον
-ι∆ιος
-ι∆ιου
-ι∆ιους
-ι∆ιων
-ι∆ιως
-κ
-καί
-καίτοι
-καθ
-καθε
-καθεμια
-καθεμιας
-καθενα
-καθενας
-καθενος
-καθετι
-καθολου
-καθως
-και
-κακα
-κακως
-καλα
-καλως
-καμια
-καμιαν
-καμιας
-καμποσα
-καμποσες
-καμποση
-καμποσην
-καμποσης
-καμποσο
-καμποσοι
-καμποσον
-καμποσος
-καμποσου
-καμποσους
-καμποσων
-κανεις
-κανεν
-κανενα
-κανεναν
-κανενας
-κανενος
-καποια
-καποιαν
-καποιας
-καποιες
-καποιο
-καποιοι
-καποιον
-καποιος
-καποιου
-καποιους
-καποιων
-καποτε
-καπου
-καπως
-κατ
-κατά
-κατα
-κατι
-κατιτι
-κατοπιν
-κατω
-κατὰ
-καὶ
-κι
-κιολας
-κλπ
-κοντα
-κτλ
-κυριως
-κἀν
-κἂν
-λιγακι
-λιγο
-λιγωτερο
-λογω
-λοιπα
-λοιπον
-μέν
-μέσα
-μή
-μήτε
-μία
-μα
-μαζι
-μακαρι
-μακρυα
-μαλιστα
-μαλλον
-μας
-με
-μεθ
-μεθαυριο
-μειον
-μελει
-μελλεται
-μεμιας
-μεν
-μερικα
-μερικες
-μερικοι
-μερικους
-μερικων
-μεσα
-μετ
-μετά
-μετα
-μεταξυ
-μετὰ
-μεχρι
-μη
-μην
-μηπως
-μητε
-μη∆ε
-μιά
-μια
-μιαν
-μιας
-μολις
-μολονοτι
-μοναχα
-μονες
-μονη
-μονην
-μονης
-μονο
-μονοι
-μονομιας
-μονος
-μονου
-μονους
-μονων
-μου
-μπορει
-μπορουν
-μπραβο
-μπρος
-μἐν
-μὲν
-μὴ
-μὴν
-να
-ναι
-νωρις
-ξανα
-ξαφνικα
-ο
-οι
-ολα
-ολες
-ολη
-ολην
-ολης
-ολο
-ολογυρα
-ολοι
-ολον
-ολονεν
-ολος
-ολοτελα
-ολου
-ολους
-ολων
-ολως
-ολως∆ιολου
-ομως
-ομωσ
-οποια
-οποιαν
-οποιαν∆ηποτε
-οποιας
-οποιας∆ηποτε
-οποια∆ηποτε
-οποιες
-οποιες∆ηποτε
-οποιο
-οποιοι
-οποιον
-οποιον∆ηποτε
-οποιος
-οποιος∆ηποτε
-οποιου
-οποιους
-οποιους∆ηποτε
-οποιου∆ηποτε
-οποιο∆ηποτε
-οποιων
-οποιων∆ηποτε
-οποι∆ηποτε
-οποτε
-οποτε∆ηποτε
-οπου
-οπου∆ηποτε
-οπως
-οπωσ
-ορισμενα
-ορισμενες
-ορισμενων
-ορισμενως
-οσα
-οσα∆ηποτε
-οσες
-οσες∆ηποτε
-οση
-οσην
-οσην∆ηποτε
-οσης
-οσης∆ηποτε
-οση∆ηποτε
-οσο
-οσοι
-οσοι∆ηποτε
-οσον
-οσον∆ηποτε
-οσος
-οσος∆ηποτε
-οσου
-οσους
-οσους∆ηποτε
-οσου∆ηποτε
-οσο∆ηποτε
-οσων
-οσων∆ηποτε
-οταν
-οτι
-οτι∆ηποτε
-οτου
-ου
-ουτε
-ου∆ε
-οχι
-οἱ
-οἳ
-οἷς
-οὐ
-οὐδ
-οὐδέ
-οὐδείσ
-οὐδεὶς
-οὐδὲ
-οὐδὲν
-οὐκ
-οὐχ
-οὐχὶ
-οὓς
-οὔτε
-οὕτω
-οὕτως
-οὕτωσ
-οὖν
-οὗ
-οὗτος
-οὗτοσ
-παλι
-παντοτε
-παντου
-παντως
-παρ
-παρά
-παρα
-παρὰ
-περί
-περα
-περι
-περιπου
-περισσοτερο
-περσι
-περυσι
-περὶ
-πια
-πιθανον
-πιο
-πισω
-πλαι
-πλεον
-πλην
-ποια
-ποιαν
-ποιας
-ποιες
-ποιεσ
-ποιο
-ποιοι
-ποιον
-ποιος
-ποιοσ
-ποιου
-ποιους
-ποιουσ
-ποιων
-πολυ
-ποσες
-ποση
-ποσην
-ποσης
-ποσοι
-ποσος
-ποσους
-ποτε
-που
-πουθε
-πουθενα
-ποῦ
-πρεπει
-πριν
-προ
-προκειμενου
-προκειται
-προπερσι
-προς
-προσ
-προτου
-προχθες
-προχτες
-πρωτυτερα
-πρόσ
-πρὸ
-πρὸς
-πως
-πωσ
-σαν
-σας
-σε
-σεις
-σημερα
-σιγα
-σου
-στα
-στη
-στην
-στης
-στις
-στο
-στον
-στου
-στους
-στων
-συγχρονως
-συν
-συναμα
-συνεπως
-συνηθως
-συχνα
-συχνας
-συχνες
-συχνη
-συχνην
-συχνης
-συχνο
-συχνοι
-συχνον
-συχνος
-συχνου
-συχνους
-συχνων
-συχνως
-σχε∆ον
-σωστα
-σόσ
-σύ
-σύν
-σὸς
-σὺ
-σὺν
-τά
-τήν
-τί
-τίς
-τίσ
-τα
-ταυτα
-ταυτες
-ταυτη
-ταυτην
-ταυτης
-ταυτο,ταυτον
-ταυτος
-ταυτου
-ταυτων
-ταχα
-ταχατε
-ταῖς
-τα∆ε
-τε
-τελικα
-τελικως
-τες
-τετοια
-τετοιαν
-τετοιας
-τετοιες
-τετοιο
-τετοιοι
-τετοιον
-τετοιος
-τετοιου
-τετοιους
-τετοιων
-τη
-την
-της
-τησ
-τι
-τινα
-τιποτα
-τιποτε
-τις
-τισ
-το
-τοί
-τοι
-τοιοῦτος
-τοιοῦτοσ
-τον
-τος
-τοσα
-τοσες
-τοση
-τοσην
-τοσης
-τοσο
-τοσοι
-τοσον
-τοσος
-τοσου
-τοσους
-τοσων
-τοτε
-του
-τουλαχιστο
-τουλαχιστον
-τους
-τουτα
-τουτες
-τουτη
-τουτην
-τουτης
-τουτο
-τουτοι
-τουτοις
-τουτον
-τουτος
-τουτου
-τουτους
-τουτων
-τούσ
-τοὺς
-τοῖς
-τοῦ
-τυχον
-των
-τωρα
-τό
-τόν
-τότε
-τὰ
-τὰς
-τὴν
-τὸ
-τὸν
-τῆς
-τῆσ
-τῇ
-τῶν
-τῷ
-υπ
-υπερ
-υπο
-υποψη
-υποψιν
-υπό
-υστερα
-φετος
-χαμηλα
-χθες
-χτες
-χωρις
-χωριστα
-ψηλα
-ω
-ωραια
-ως
-ωσ
-ωσαν
-ωσοτου
-ωσπου
-ωστε
-ωστοσο
-ωχ
-ἀλλ'
-ἀλλά
-ἀλλὰ
-ἀλλ’
-ἀπ
-ἀπό
-ἀπὸ
-ἀφ
-ἂν
-ἃ
-ἄλλος
-ἄλλοσ
-ἄν
-ἄρα
-ἅμα
-ἐάν
-ἐγώ
-ἐγὼ
-ἐκ
-ἐμόσ
-ἐμὸς
-ἐν
-ἐξ
-ἐπί
-ἐπεὶ
-ἐπὶ
-ἐστι
-ἐφ
-ἐὰν
-ἑαυτοῦ
-ἔτι
-ἡ
-ἢ
-ἣ
-ἤ
-ἥ
-ἧς
-ἵνα
-ὁ
-ὃ
-ὃν
-ὃς
-ὅ
-ὅδε
-ὅθεν
-ὅπερ
-ὅς
-ὅσ
-ὅστις
-ὅστισ
-ὅτε
-ὅτι
-ὑμόσ
-ὑπ
-ὑπέρ
-ὑπό
-ὑπὲρ
-ὑπὸ
-ὡς
-ὡσ
-ὥς
-ὥστε
-ὦ
-ᾧ
-∆α
-∆ε
-∆εινα
-∆εν
-∆εξια
-∆ηθεν
-∆ηλα∆η
-∆ι
-∆ια
-∆ιαρκως
-∆ικα
-∆ικο
-∆ικοι
-∆ικος
-∆ικου
-∆ικους
-∆ιολου
-∆ιπλα
-∆ιχως
--- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java
+++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java
@ -9,7 +9,6 @@ import org.apache.commons.io.IOUtils;

 import java.io.IOException;
 import java.io.StringWriter;
-import java.nio.charset.StandardCharsets;
 import java.util.List;
 import java.util.stream.Collectors;

@ -18,7 +17,7 @@ public abstract class AbstractPaceTest extends AbstractPaceFunctions {
 	protected String readFromClasspath(final String filename) {
 		final StringWriter sw = new StringWriter();
 		try {
-			IOUtils.copy(getClass().getResourceAsStream(filename), sw, StandardCharsets.UTF_8);
+			IOUtils.copy(getClass().getResourceAsStream(filename), sw);
 			return sw.toString();
 		} catch (final IOException e) {
 			throw new RuntimeException("cannot load resource from classpath: " + filename);
@ -37,10 +36,6 @@ public abstract class AbstractPaceTest extends AbstractPaceFunctions {
 		return new FieldValueImpl(Type.URL, "url", s);
 	}

-	protected Field array(final double[] a) {
-		return new FieldValueImpl(Type.DoubleArray, "array", a);
-	}
-
 	protected Field createFieldList(List<String> strings, String fieldName){

 		List<FieldValueImpl> fieldValueStream = strings.stream().map(s -> new FieldValueImpl(Type.String, fieldName, s)).collect(Collectors.toList());
--- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
+++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
@ -2,15 +2,12 @@ package eu.dnetlib.pace.clustering;

 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
-import com.google.common.collect.Sets;
 import eu.dnetlib.pace.AbstractPaceTest;
 import eu.dnetlib.pace.common.AbstractPaceFunctions;
 import eu.dnetlib.pace.config.DedupConfig;
 import org.junit.jupiter.api.*;

 import java.util.Map;
-import java.util.Set;
-import java.util.stream.Collectors;

 public class ClusteringFunctionTest extends AbstractPaceTest {

@ -103,11 +100,6 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
 		final String s = "Search for the Standard Model Higgs Boson";
 		System.out.println(s);
 		System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
-
-		params.put("len", 3);
-		params.put("max", 1);
-
-		System.out.println(sp.apply(conf, Lists.newArrayList(title("Framework for general-purpose deduplication"))));
 	}

 	@Test
@ -153,10 +145,6 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
 		System.out.println(s);
 		System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));

-		s = "niivue/niivue: 0.21.1";
-		System.out.println(s);
-		System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
-
 	}

 	@Test
@ -199,51 +187,5 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
 		System.out.println("s5 = " + s5);
 		System.out.println(cf.apply(conf, Lists.newArrayList(title(s5))));

-		final String s6 = "National and Kapodistrian University of Athens";
-		System.out.println("s6 = " + s6);
-		System.out.println(cf.apply(conf, Lists.newArrayList(title(s6))));
-
-		final String s7 = "Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών";
-		System.out.println("s7 = " + s7);
-		System.out.println(cf.apply(conf, Lists.newArrayList(title(s7))));
-
 	}
-
-	@Test
-	public void testPersonClustering(){
-
-		final ClusteringFunction cf = new PersonClustering(params);
-		final String s = "Abd-Alla, Abo-el-nour N.";
-		System.out.println("s = " + s);
-		System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
-
-		final String s1 = "Manghi, Paolo";
-		System.out.println("s1 = " + s1);
-		System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
-
-	}
-
-	@Test
-	public void testPersonHash(){
-
-		final ClusteringFunction cf = new PersonHash(params);
-		final String s = "Manghi, Paolo";
-		System.out.println("s = " + s);
-		System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
-
-		final String s1 = "Manghi, P.";
-		System.out.println("s = " + s1);
-		System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
-
-	}
-
-	@Test
-	public void testLastNameFirstInitial(){
-
-		final ClusteringFunction cf = new LastNameFirstInitial(params);
-		final String s = "LI Yonghong";
-		System.out.println("s = " + s);
-		System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
-	}
-
-}
+}
--- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java
+++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java
@ -2,16 +2,13 @@ package eu.dnetlib.pace.comparators;

 import eu.dnetlib.pace.AbstractPaceTest;
 import eu.dnetlib.pace.clustering.NGramUtils;
-import eu.dnetlib.pace.config.Type;
 import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.FieldValueImpl;
 import eu.dnetlib.pace.tree.*;
 import eu.dnetlib.pace.config.DedupConfig;

 import org.junit.jupiter.api.*;
 import static org.junit.jupiter.api.Assertions.assertEquals;

-import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;
@ -24,20 +21,15 @@ public class ComparatorTest extends AbstractPaceTest {

 	@BeforeAll
 	public void setup() {
-		conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class));
-	}
-
-	@BeforeEach
-	public void beforeEachTest() {
 		params = new HashMap<>();
 		params.put("weight", "1.0");
 		params.put("surname_th", "0.99");
 		params.put("name_th", "0.95");
 		params.put("jpath_value", "$.value");
 		params.put("jpath_classid", "$.qualifier.classid");
+		conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class));
 	}

-
 	@Test
 	public void testCleanForSorting() {
 		NGramUtils utils = new NGramUtils();
@ -64,10 +56,7 @@ public class ComparatorTest extends AbstractPaceTest {
 		//particular cases
 		assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf));
 		assertEquals(1.0, cityMatch.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf));
-
-		// failing becasuse 'Allen' is a transliterrated greek stopword
-		// assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
-		assertEquals(-1.0, cityMatch.distance("Washington (United States)", "United States Military Academy", conf));
+		assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
 	}

 	@Test
@ -81,7 +70,7 @@ public class ComparatorTest extends AbstractPaceTest {
 		assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf));
 		assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
 		assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf));
-		assertEquals(2.0/3.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
+		assertEquals(0.5, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
 		assertEquals(0.5, keywordMatch.distance("University College London", "University of London", conf));
 		assertEquals(0.5, keywordMatch.distance("Washington State University", "University of Washington", conf));
 		assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf));
@ -115,7 +104,7 @@ public class ComparatorTest extends AbstractPaceTest {
 	public void stringContainsMatchTest(){

 		params.put("string", "openorgs");
-		params.put("aggregator", "XOR");
+		params.put("bool", "XOR");
 		params.put("caseSensitive", "false");

 		StringContainsMatch stringContainsMatch = new StringContainsMatch(params);
@ -123,7 +112,7 @@ public class ComparatorTest extends AbstractPaceTest {
 		assertEquals(0.0, stringContainsMatch.distance("openorgs", "openorgs", conf));

 		params.put("string", "openorgs");
-		params.put("aggregator", "AND");
+		params.put("bool", "AND");
 		params.put("caseSensitive", "false");

 		stringContainsMatch = new StringContainsMatch(params);
@ -257,10 +246,6 @@ public class ComparatorTest extends AbstractPaceTest {

 		assertEquals(0.25, result);

-		Field f = createFieldList(new ArrayList<>(), "authors");
-		result = authorsMatch.compare(f,f, conf);
-		System.out.println("result = " + result);
-
 	}

 	@Test
@ -282,30 +267,5 @@ public class ComparatorTest extends AbstractPaceTest {
 		assertEquals(1.0, result);
 	}

-	@Test
-	public void domainExactMatch() {
-
-		DomainExactMatch domainExactMatch = new DomainExactMatch(params);
-		Field a = url("http://www.flowrepository.org");
-		Field b = url("http://flowrepository.org/");
-
-		double compare = domainExactMatch.compare(a, b, conf);
-		System.out.println("compare = " + compare);
-
-	}
-
-	@Test
-	public void cosineSimilarity() {
-
-		CosineSimilarity cosineSimilarity = new CosineSimilarity(params);
-
-		Field a = new FieldValueImpl(Type.DoubleArray, "array", new double[]{1,2,3});
-		Field b = new FieldValueImpl(Type.DoubleArray, "array", new double[]{1,2,3});
-
-		double compare = cosineSimilarity.compare(a, b, conf);
-
-		System.out.println("compare = " + compare);
-	}
-

 }
--- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java
+++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java
@ -7,7 +7,6 @@ import eu.dnetlib.pace.clustering.ClusteringClass;
 import eu.dnetlib.pace.clustering.ClusteringCombiner;
 import eu.dnetlib.pace.model.Field;
 import eu.dnetlib.pace.model.FieldList;
-import eu.dnetlib.pace.model.FieldValue;
 import eu.dnetlib.pace.model.MapDocument;
 import eu.dnetlib.pace.tree.JsonListMatch;
 import eu.dnetlib.pace.tree.support.AggType;
@ -21,7 +20,10 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertTrue;

-import java.util.*;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
 import java.util.stream.Collectors;


@ -83,7 +85,7 @@ public class ConfigTest extends AbstractPaceTest {
 	}

 	@Test
-	public void asMapDocumentTest1() {
+	public void asMapDocumentTest() {

 		DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json"));

@ -101,19 +103,6 @@ public class ConfigTest extends AbstractPaceTest {

    }

-	@Test
-	public void authorAsMapDocument() {
-
-		DedupConfig dedupConf = DedupConfig.load(readFromClasspath("author.fdup.conf.json"));
-
-		final String json = readFromClasspath("author.json");
-
-		final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
-
-		System.out.println("mapDocument = " + Arrays.toString(((FieldValue) mapDocument.getFieldMap().get("topics")).doubleArrayValue()));
-
-	}
-
    @Test
    public  void testJPath()  {
        final String json = readFromClasspath("organization.json");
--- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
+++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
@ -1,6 +1,7 @@
 package eu.dnetlib.pace.util;

 import eu.dnetlib.pace.model.Person;
+import jdk.nashorn.internal.ir.annotations.Ignore;
 import org.junit.jupiter.api.*;

 import java.util.HashMap;
@ -17,6 +18,7 @@ public class UtilTest {
    }

    @Test
+    @Ignore
    public void paceResolverTest() {
        PaceResolver paceResolver = new PaceResolver();
        paceResolver.getComparator("keywordMatch", params);
@ -28,11 +30,6 @@ public class UtilTest {

        assertEquals("kennedy", p.getSurnameString());
        assertEquals("j f", p.getNameString());
-
-        p = new Person("Guan-Hua Du", false);
-
-        System.out.println("surname = " + p.getSurnameString());
-        System.out.println("name = " + p.getNameString());
    }

 }
--- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.fdup.conf.json
+++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.fdup.conf.json
@ -1,134 +0,0 @@
-{
-  "wf": {
-    "threshold": "0.99",
-    "dedupRun": "001",
-    "entityType": "author",
-    "subEntityType": "author",
-    "subEntityValue": "author",
-    "orderField": "fullname",
-    "queueMaxSize": "200",
-    "groupMaxSize": "100",
-    "maxChildren": "100",
-    "slidingWindowSize": "50",
-    "rootBuilder": [
-      "result",
-      "resultProject_outcome_isProducedBy",
-      "resultResult_publicationDataset_isRelatedTo",
-      "resultResult_similarity_isAmongTopNSimilarDocuments",
-      "resultResult_similarity_hasAmongTopNSimilarDocuments",
-      "resultOrganization_affiliation_isAffiliatedWith",
-      "resultResult_part_hasPart",
-      "resultResult_part_isPartOf",
-      "resultResult_supplement_isSupplementTo",
-      "resultResult_supplement_isSupplementedBy",
-      "resultResult_version_isVersionOf"
-    ],
-    "includeChildren": "true",
-    "maxIterations": 20,
-    "idPath": "$.id"
-  },
-  "pace": {
-    "clustering" : [
-      { "name" : "lnfi", "fields" : [ "fullname" ], "params" : {} }
-    ],
-    "decisionTree": {
-      "start": {
-        "fields": [
-          {
-            "field": "orcid",
-            "comparator": "exactMatch",
-            "weight": 1.0,
-            "countIfUndefined": "true",
-            "params": {}
-          }
-        ],
-        "threshold": 1.0,
-        "aggregation": "MAX",
-        "positive": "MATCH",
-        "negative": "NO_MATCH",
-        "undefined": "orcids",
-        "ignoreUndefined": "true"
-      },
-      "orcids": {
-        "fields": [
-          {
-            "field": "orcids",
-            "comparator": "stringListMatch",
-            "weight": 1.0,
-            "countIfUndefined": "true",
-            "params": {"type": "count"}
-          }
-        ],
-        "threshold": 3.0,
-        "aggregation": "MAX",
-        "positive": "MATCH",
-        "negative": "coauthors",
-        "undefined": "coauthors",
-        "ignoreUndefined": "true"
-      },
-      "coauthors": {
-        "fields": [
-          {
-            "field": "coauthors",
-            "comparator": "authorsMatch",
-            "weight": 1.0,
-            "countIfUndefined": "true",
-            "params": {"type": "count"}
-          }
-        ],
-        "threshold": 1.0,
-        "aggregation": "MAX",
-        "positive": "topicsMatch",
-        "negative": "NO_MATCH",
-        "undefined": "topicsMatch",
-        "ignoreUndefined": "true"
-      },
-      "topicsMatch": {
-        "fields": [
-          {
-            "field": "topics",
-            "comparator": "cosineSimilarity",
-            "weight": 1.0,
-            "countIfUndefined": "true",
-            "params": {}
-          }
-        ],
-        "threshold": 1.0,
-        "aggregation": "MAX",
-        "positive": "MATCH",
-        "negative": "NO_MATCH",
-        "undefined": "NO_MATCH",
-        "ignoreUndefined": "false"
-      }
-    },
-    "model": [
-      {
-        "name": "topics",
-        "type": "DoubleArray",
-        "path": "$.topics"
-      },
-      {
-        "name": "fullname",
-        "type": "String",
-        "path": "$.fullname"
-      },
-      {
-        "name": "orcid",
-        "type": "String",
-        "path": "$.orcid"
-      },
-      {
-        "name": "coauthors",
-        "type": "List",
-        "path": "$.coAuthors[*].fullname"
-      },
-      {
-        "name": "orcids",
-        "type": "List",
-        "path": "$.coAuthors[*].orcid"
-      }
-    ],
-    "blacklists": {},
-    "synonyms": {}
-  }
-}
--- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.json
+++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.json
@ -1 +0,0 @@
-{"fullname":"Zaragoza, Maria Cleofé","firstname":"Maria Cleofé","lastname":"Zaragoza","coAuthors":[{"fullname":"Cambras, Trinitat","lastname":"Cambras","firstname":"Trinitat","orcid":"0000-0002-9009-4690"},{"fullname":"Castro-Marrero, Jesús","lastname":"Castro-Marrero","firstname":"Jesús","orcid":""},{"fullname":"Díez-Noguera, Antoni","lastname":"Díez-Noguera","firstname":"Antoni","orcid":""},{"fullname":"Alegre, José","lastname":"Alegre","firstname":"José","orcid":"0000-0002-7582-7585"}],"topics":[0.9522090839562252,0.04779091604377485],"orcid":"0000-0002-9797-0219","id":"author::1a10826c83c7f9f0dcebe7df05e37a2a","pubId":"50|pmid________::db7fd19db5a620eafad40cfb97f9690d"}
--- a/pom.xml
+++ b/pom.xml
@ -5,7 +5,7 @@

    <groupId>eu.dnetlib</groupId>
    <artifactId>dnet-dedup</artifactId>
-    <version>4.1.13-SNAPSHOT</version>
+    <version>4.1.9</version>

    <packaging>pom</packaging>

@ -22,7 +22,7 @@

    <scm>
        <developerConnection>scm:git:https://code-repo.d4science.org/D-Net/dnet-dedup.git</developerConnection>
-        <tag>dnet-dedup-4.0.3</tag>
+        <tag>dnet-dedup-4.1.9</tag>
    </scm>

    <modules>
@ -144,7 +144,14 @@
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-surefire-plugin</artifactId>
-                    <version>2.22.0</version>
+                    <version>2.19.1</version>
+                    <dependencies>
+                        <dependency>
+                            <groupId>org.junit.jupiter</groupId>
+                            <artifactId>junit-jupiter</artifactId>
+                            <version>${junit-jupiter.version}</version>
+                        </dependency>
+                    </dependencies>
                    <configuration>
                        <redirectTestOutputToFile>false</redirectTestOutputToFile>
                    </configuration>
@ -254,7 +261,7 @@
        <oozie.use.system.libpath>true</oozie.use.system.libpath>
        <properties.maven.plugin.version>2.0.1</properties.maven.plugin.version>
        <junit-jupiter.version>5.6.1</junit-jupiter.version>
-        <maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>../dhp-build/dhp-build-assembly-resources/target/dhp-build-assembly-resources-${project.version}.jar</maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>
+        <maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>../dhp-build/dhp-build-assembly-resources/target/dhp-build-assembly-resources-4.1.8-SNAPSHOT.jar</maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>

    </properties>

@ -404,11 +411,20 @@
            </dependency>

            <dependency>
-                <groupId>com.ibm.icu</groupId>
-                <artifactId>icu4j</artifactId>
-                <version>70.1</version>
+                <groupId>org.mockito</groupId>
+                <artifactId>mockito-core</artifactId>
+                <version>3.3.3</version>
+                <scope>test</scope>
            </dependency>

+            <dependency>
+                <groupId>org.mockito</groupId>
+                <artifactId>mockito-junit-jupiter</artifactId>
+                <version>3.3.3</version>
+                <scope>test</scope>
+            </dependency>
+
+
        </dependencies>

    </dependencyManagement>
				`@ -1 +0,0 @@`
				{"fullname":"Zaragoza, Maria Cleofé","firstname":"Maria Cleofé","lastname":"Zaragoza","coAuthors":[{"fullname":"Cambras, Trinitat","lastname":"Cambras","firstname":"Trinitat","orcid":"0000-0002-9009-4690"},{"fullname":"Castro-Marrero, Jesús","lastname":"Castro-Marrero","firstname":"Jesús","orcid":""},{"fullname":"Díez-Noguera, Antoni","lastname":"Díez-Noguera","firstname":"Antoni","orcid":""},{"fullname":"Alegre, José","lastname":"Alegre","firstname":"José","orcid":"0000-0002-7582-7585"}],"topics":[0.9522090839562252,0.04779091604377485],"orcid":"0000-0002-9797-0219","id":"author::1a10826c83c7f9f0dcebe7df05e37a2a","pubId":"50\|pmid________::db7fd19db5a620eafad40cfb97f9690d"}