merged from master

2020-01-17 14:25:57 +01:00 · 2020-01-17 14:25:57 +01:00 · 1cd6899480
parent 63c0db4ff8 749b0660ab
commit 1cd6899480
55 changed files with 3285 additions and 74 deletions
--- a/.gitignore
+++ b/.gitignore
@ -18,4 +18,5 @@
 /*/build
 /build
 spark-warehouse
-/dhp-workflows/dhp-graph-mapper/job-override.properties
+/*/*/job-override.properties
+
--- a/dhp-build/dhp-build-assembly-resources/pom.xml
+++ b/dhp-build/dhp-build-assembly-resources/pom.xml
@ -17,6 +17,7 @@
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
+                <version>${maven.compiler.plugin.version}</version>
            </plugin>
        </plugins>
    </build>
--- a/dhp-build/dhp-build-properties-maven-plugin/pom.xml
+++ b/dhp-build/dhp-build-properties-maven-plugin/pom.xml
@ -53,6 +53,7 @@
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
+                <version>${maven.compiler.plugin.version}</version>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@ -17,6 +17,10 @@
 			<groupId>commons-cli</groupId>
 			<artifactId>commons-cli</artifactId>
 		</dependency>
+		<dependency>
+			<groupId>commons-io</groupId>
+			<artifactId>commons-io</artifactId>
+		</dependency>
 		<dependency>
 			<groupId>org.apache.commons</groupId>
 			<artifactId>commons-lang3</artifactId>
@ -29,21 +33,15 @@
 			<groupId>javax.persistence</groupId>
 			<artifactId>javax.persistence-api</artifactId>
 		</dependency>
-
 		<dependency>
 			<groupId>com.fasterxml.jackson.core</groupId>
 			<artifactId>jackson-databind</artifactId>
 		</dependency>
-
 		<!-- https://mvnrepository.com/artifact/com.rabbitmq/amqp-client -->
 		<dependency>
 			<groupId>com.rabbitmq</groupId>
 			<artifactId>amqp-client</artifactId>
 		</dependency>
-		<dependency>
-			<groupId>commons-io</groupId>
-			<artifactId>commons-io</artifactId>
-		</dependency>
 	</dependencies>

 </project>
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java
@ -2,17 +2,25 @@ package eu.dnetlib.dhp.application;

 import com.fasterxml.jackson.databind.ObjectMapper;
 import org.apache.commons.cli.*;
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.io.IOUtils;

+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
 import java.io.Serializable;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Map;
+import java.io.StringWriter;
+import java.util.*;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+import java.util.zip.Inflater;

 public class ArgumentApplicationParser implements Serializable {

    private final Options options = new Options();
    private final Map<String, String> objectMap = new HashMap<>();

+    private final List<String> compressedValues = new ArrayList<>();
+
    public ArgumentApplicationParser(final String json_configuration) throws Exception {
        final ObjectMapper mapper = new ObjectMapper();
        final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class);
@ -29,6 +37,9 @@ public class ArgumentApplicationParser implements Serializable {
            final Option o = new Option(conf.getParamName(), true, conf.getParamDescription());
            o.setLongOpt(conf.getParamLongName());
            o.setRequired(conf.isParamRequired());
+            if (conf.isCompressed()) {
+                compressedValues.add(conf.getParamLongName());
+            }
            return o;
        }).forEach(options::addOption);

@ -38,10 +49,32 @@ public class ArgumentApplicationParser implements Serializable {

    }

+
+    /**
+     * Reverses {@link #compressArgument(String)}: Base64-decodes the argument, gunzips the
+     * bytes and reads the result as UTF-8 text.
+     *
+     * @param abstractCompressed the Base64 encoded, gzip compressed value
+     * @return the decompressed text
+     * @throws RuntimeException wrapping any failure (null input, bad Base64, corrupt gzip);
+     *         the offending value is printed to standard output first
+     */
+    public static String decompressValue(final String abstractCompressed) {
+        try {
+            final byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes(java.nio.charset.StandardCharsets.UTF_8));
+            // try-with-resources closes the gzip stream even when the payload is corrupt
+            try (GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray))) {
+                final StringWriter stringWriter = new StringWriter();
+                // explicit encoding: the platform default charset would make the result host-dependent
+                IOUtils.copy(gis, stringWriter, "UTF-8");
+                return stringWriter.toString();
+            }
+        } catch (Throwable e) {
+            System.out.println("Wrong value to decompress:" + abstractCompressed);
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Gzips the UTF-8 bytes of {@code value} and returns them Base64 encoded, in the form
+     * expected by {@link #decompressValue(String)}.
+     *
+     * @param value the text to compress
+     * @return the Base64 encoding of the gzip compressed text
+     * @throws Exception if {@code value} is null or the compression fails
+     */
+    public static String compressArgument(final String value)  throws Exception{
+        final ByteArrayOutputStream out = new ByteArrayOutputStream();
+        // closing the gzip stream writes the trailer, so it must happen before out is read
+        try (GZIPOutputStream gzip = new GZIPOutputStream(out)) {
+            gzip.write(value.getBytes(java.nio.charset.StandardCharsets.UTF_8));
+        }
+        return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
+    }
+
    public void parseArgument(final String[] args) throws Exception {
+        // Parses args against the options registered in the constructor and stores each
+        // parsed option in objectMap under its long name.
        CommandLineParser parser = new BasicParser();
        CommandLine cmd = parser.parse(options, args);
-        Arrays.stream(cmd.getOptions()).forEach(it -> objectMap.put(it.getLongOpt(), it.getValue()));
+        // Options whose long name was recorded in compressedValues are decoded with decompressValue first.
+        Arrays.stream(cmd.getOptions()).forEach(it -> objectMap.put(it.getLongOpt(), compressedValues.contains(it.getLongOpt())? decompressValue(it.getValue()): it.getValue()));
    }

    public String get(final String key) {
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java
@ -7,6 +7,7 @@ public class OptionsParameter {
    private String paramLongName;
    private String paramDescription;
    private boolean paramRequired;
+    private boolean compressed;

    public OptionsParameter() {
    }
@ -26,4 +27,12 @@ public class OptionsParameter {
    public boolean isParamRequired() {
        return paramRequired;
    }
+
+    /**
+     * @return the value of the {@code compressed} flag of this parameter description
+     */
+    public boolean isCompressed() {
+        return compressed;
+    }
+
+    /**
+     * @param compressed the new value of the {@code compressed} flag
+     */
+    public void setCompressed(boolean compressed) {
+        this.compressed = compressed;
+    }
 }
--- a/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java
@ -3,6 +3,10 @@ package eu.dnetlib.dhp.application;
 import org.apache.commons.io.IOUtils;
 import org.junit.Test;

+import java.io.ByteArrayOutputStream;
+import java.util.Base64;
+import java.util.zip.GZIPOutputStream;
+
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;

@ -24,6 +28,7 @@ public class ArgumentApplicationParserTest {
                "-ro", "value7",
                "-rr", "value8",
                "-w", "value9",
+                "-cc", ArgumentApplicationParser.compressArgument(jsonConfiguration)
        });
        assertNotNull(parser.get("hdfsPath"));
        assertNotNull(parser.get("apidescriptor"));
@ -45,7 +50,12 @@ public class ArgumentApplicationParserTest {
        assertEquals("value7", parser.get("rabbitOngoingQueue"));
        assertEquals("value8", parser.get("rabbitReportQueue"));
        assertEquals("value9", parser.get("workflowId"));
+        assertEquals(jsonConfiguration, parser.get("ccCoco"));
    }


+
+
+
+
 }
--- a/dhp-common/src/test/resources/eu/dnetlib/application/parameters.json
+++ b/dhp-common/src/test/resources/eu/dnetlib/application/parameters.json
@ -1,12 +1,13 @@
 [
-  {"paramName":"p",   "paramLongName":"hdfsPath",           "paramDescription": "the path where storing the sequential file",           "paramRequired": true},
-  {"paramName":"a",   "paramLongName":"apidescriptor",      "paramDescription": "the JSON encoding of the API Descriptor",              "paramRequired": true},
-  {"paramName":"n",   "paramLongName":"namenode",           "paramDescription": "the Name Node URI",                                    "paramRequired": true},
-  {"paramName":"u",   "paramLongName":"userHDFS",           "paramDescription": "the user wich create the hdfs seq file",               "paramRequired": true},
-  {"paramName":"ru",  "paramLongName":"rabbitUser",         "paramDescription": "the user to connect with RabbitMq for messaging",      "paramRequired": true},
-  {"paramName":"rp",  "paramLongName":"rabbitPassWord",     "paramDescription": "the password to connect with RabbitMq for messaging",  "paramRequired": true},
-  {"paramName":"rh",  "paramLongName":"rabbitHost",         "paramDescription": "the host of the RabbitMq server",                      "paramRequired": true},
-  {"paramName":"ro",  "paramLongName":"rabbitOngoingQueue", "paramDescription": "the name of the ongoing queue",                        "paramRequired": true},
-  {"paramName":"rr",  "paramLongName":"rabbitReportQueue",  "paramDescription": "the name of the report queue",                         "paramRequired": true},
-  {"paramName":"w",   "paramLongName":"workflowId",         "paramDescription": "the identifier of the dnet Workflow",                  "paramRequired": true}
+  {"paramName":"p",   "paramLongName":"hdfsPath",           "paramDescription": "the path where storing the sequential file",            "paramRequired": true},
+  {"paramName":"a",   "paramLongName":"apidescriptor",      "paramDescription": "the JSON encoding of the API Descriptor",               "paramRequired": true},
+  {"paramName":"n",   "paramLongName":"namenode",           "paramDescription": "the Name Node URI",                                     "paramRequired": true},
+  {"paramName":"u",   "paramLongName":"userHDFS",           "paramDescription": "the user which creates the hdfs seq file",              "paramRequired": true},
+  {"paramName":"ru",  "paramLongName":"rabbitUser",         "paramDescription": "the user to connect with RabbitMq for messaging",       "paramRequired": true},
+  {"paramName":"rp",  "paramLongName":"rabbitPassWord",     "paramDescription": "the password to connect with RabbitMq for messaging",   "paramRequired": true},
+  {"paramName":"rh",  "paramLongName":"rabbitHost",         "paramDescription": "the host of the RabbitMq server",                       "paramRequired": true},
+  {"paramName":"ro",  "paramLongName":"rabbitOngoingQueue", "paramDescription": "the name of the ongoing queue",                         "paramRequired": true},
+  {"paramName":"rr",  "paramLongName":"rabbitReportQueue",  "paramDescription": "the name of the report queue",                          "paramRequired": true},
+  {"paramName":"w",   "paramLongName":"workflowId",         "paramDescription": "the identifier of the dnet Workflow",                   "paramRequired": true},
+  {"paramName":"cc",   "paramLongName":"ccCoco",            "paramDescription": "the identifier of the dnet Workflow", "compressed":true,"paramRequired": true}
 ]
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Context.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Context.java
@ -23,4 +23,23 @@ public class Context implements Serializable {
    public void setDataInfo(List<DataInfo> dataInfo) {
        this.dataInfo = dataInfo;
    }
+
+    /**
+     * Hash based on the context id only; a context without id hashes to 0.
+     */
+    @Override
+    public int hashCode() {
+        if (id == null) {
+            return 0;
+        }
+        return id.hashCode();
+    }
+
+    /**
+     * Two contexts are equal when they are of the same class and carry the same id.
+     * Two contexts without id are considered equal, matching {@link #hashCode()}.
+     *
+     * @param obj the object to compare with
+     * @return true if {@code obj} is a context with the same id
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj)
+            return true;
+        if (obj == null)
+            return false;
+        if (getClass() != obj.getClass())
+            return false;
+
+        Context other = (Context) obj;
+
+        // null-safe: hashCode() accepts a missing id, so equals() must not throw on it
+        return id == null ? other.getId() == null : id.equals(other.getId());
+    }
 }
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Dataset.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Dataset.java
@ -19,7 +19,7 @@ public class Dataset extends Result implements Serializable {

    private List<GeoLocation> geolocation;

-    public Field<String> getStoragedate() {
+    public  Field<String> getStoragedate() {
        return storagedate;
    }

@ -74,4 +74,26 @@ public class Dataset extends Result implements Serializable {
    public void setGeolocation(List<GeoLocation> geolocation) {
        this.geolocation = geolocation;
    }
+
+    /**
+     * Merges {@code e} into this dataset. After delegating to the superclass, every scalar
+     * field is replaced by the value of {@code e} when that value is not null and {@code e}
+     * has the higher trust; geolocations are merged as a duplicate-free list.
+     *
+     * @param e the entity to merge from; null is ignored
+     * @throws ClassCastException if {@code e} is not a {@link Dataset}
+     */
+    @Override
+    public void mergeFrom(OafEntity e) {
+        // nothing to merge; without this guard the accessors below would throw NPE
+        if (e == null)
+            return;
+
+        super.mergeFrom(e);
+        final Dataset d = (Dataset) e;
+
+        // the field assignments below do not touch dataInfo, so the comparison is evaluated once
+        final boolean otherMoreTrusted = compareTrust(this, e) < 0;
+
+        storagedate = d.getStoragedate() != null && otherMoreTrusted ? d.getStoragedate() : storagedate;
+
+        device = d.getDevice() != null && otherMoreTrusted ? d.getDevice() : device;
+
+        size = d.getSize() != null && otherMoreTrusted ? d.getSize() : size;
+
+        version = d.getVersion() != null && otherMoreTrusted ? d.getVersion() : version;
+
+        lastmetadataupdate = d.getLastmetadataupdate() != null && otherMoreTrusted ? d.getLastmetadataupdate() : lastmetadataupdate;
+
+        metadataversionnumber = d.getMetadataversionnumber() != null && otherMoreTrusted ? d.getMetadataversionnumber() : metadataversionnumber;
+
+        geolocation = mergeLists(geolocation, d.getGeolocation());
+
+        // must stay last: it may replace dataInfo, which the trust comparison reads
+        mergeOAFDataInfo(d);
+    }
 }
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java
@ -78,7 +78,7 @@ public class Datasource extends OafEntity implements Serializable {

    private Field<String> certificates;

-    private List< KeyValue> policies;
+    private List<KeyValue> policies;

    private Journal journal;

@ -361,4 +361,67 @@ public class Datasource extends OafEntity implements Serializable {
    public void setJournal(Journal journal) {
        this.journal = journal;
    }
+
+    /**
+     * Merges {@code e} into this datasource. After delegating to the superclass, every scalar
+     * field is replaced by the value of {@code e} when that value is not null and {@code e}
+     * has the higher trust; otherwise the current value is kept. List fields are merged as
+     * duplicate-free lists.
+     *
+     * @param e the entity to merge from; null is ignored
+     * @throws ClassCastException if {@code e} is not a {@link Datasource}
+     */
+    @Override
+    public void mergeFrom(OafEntity e) {
+        // nothing to merge; without this guard the accessors below would throw NPE
+        if (e == null)
+            return;
+
+        super.mergeFrom(e);
+
+        Datasource d = (Datasource)e;
+
+        // the field assignments below do not touch dataInfo, so the comparison is evaluated once
+        final boolean otherMoreTrusted = compareTrust(this, e) < 0;
+
+        datasourcetype = d.getDatasourcetype() != null && otherMoreTrusted ? d.getDatasourcetype() : datasourcetype;
+        openairecompatibility = d.getOpenairecompatibility() != null && otherMoreTrusted ? d.getOpenairecompatibility() : openairecompatibility;
+        officialname = d.getOfficialname() != null && otherMoreTrusted ? d.getOfficialname() : officialname;
+        // the fallback must be englishname itself, not officialname
+        englishname = d.getEnglishname() != null && otherMoreTrusted ? d.getEnglishname() : englishname;
+        websiteurl = d.getWebsiteurl() != null && otherMoreTrusted ? d.getWebsiteurl() : websiteurl;
+        logourl = d.getLogourl() != null && otherMoreTrusted ? d.getLogourl() : logourl;
+        contactemail = d.getContactemail() != null && otherMoreTrusted ? d.getContactemail() : contactemail;
+        namespaceprefix = d.getNamespaceprefix() != null && otherMoreTrusted ? d.getNamespaceprefix() : namespaceprefix;
+        latitude = d.getLatitude() != null && otherMoreTrusted ? d.getLatitude() : latitude;
+        longitude = d.getLongitude() != null && otherMoreTrusted ? d.getLongitude() : longitude;
+        dateofvalidation = d.getDateofvalidation() != null && otherMoreTrusted ? d.getDateofvalidation() : dateofvalidation;
+        description = d.getDescription() != null && otherMoreTrusted ? d.getDescription() : description;
+        subjects = mergeLists(subjects, d.getSubjects());
+
+        // opendoar specific fields (od*)
+        odnumberofitems = d.getOdnumberofitems() != null && otherMoreTrusted ? d.getOdnumberofitems() : odnumberofitems;
+        odnumberofitemsdate = d.getOdnumberofitemsdate() != null && otherMoreTrusted ? d.getOdnumberofitemsdate() : odnumberofitemsdate;
+        odpolicies = d.getOdpolicies() != null && otherMoreTrusted ? d.getOdpolicies() : odpolicies;
+        odlanguages = mergeLists(odlanguages, d.getOdlanguages());
+        odcontenttypes = mergeLists(odcontenttypes, d.getOdcontenttypes());
+        accessinfopackage = mergeLists(accessinfopackage, d.getAccessinfopackage());
+
+        // re3data fields
+        releasestartdate = d.getReleasestartdate() != null && otherMoreTrusted ? d.getReleasestartdate() : releasestartdate;
+        releaseenddate = d.getReleaseenddate() != null && otherMoreTrusted ? d.getReleaseenddate() : releaseenddate;
+        missionstatementurl = d.getMissionstatementurl() != null && otherMoreTrusted ? d.getMissionstatementurl() : missionstatementurl;
+        dataprovider = d.getDataprovider() != null && otherMoreTrusted ? d.getDataprovider() : dataprovider;
+        serviceprovider = d.getServiceprovider() != null && otherMoreTrusted ? d.getServiceprovider() : serviceprovider;
+
+        // {open, restricted or closed}
+        databaseaccesstype = d.getDatabaseaccesstype() != null && otherMoreTrusted ? d.getDatabaseaccesstype() : databaseaccesstype;
+
+        // {open, restricted or closed}
+        datauploadtype = d.getDatauploadtype() != null && otherMoreTrusted ? d.getDatauploadtype() : datauploadtype;
+
+        // {feeRequired, registration, other}
+        databaseaccessrestriction = d.getDatabaseaccessrestriction() != null && otherMoreTrusted ? d.getDatabaseaccessrestriction() : databaseaccessrestriction;
+
+        // {feeRequired, registration, other}
+        datauploadrestriction = d.getDatauploadrestriction() != null && otherMoreTrusted ? d.getDatauploadrestriction() : datauploadrestriction;
+
+        versioning = d.getVersioning() != null && otherMoreTrusted ? d.getVersioning() : versioning;
+        citationguidelineurl = d.getCitationguidelineurl() != null && otherMoreTrusted ? d.getCitationguidelineurl() : citationguidelineurl;
+
+        //{yes, no, unknown}
+        qualitymanagementkind = d.getQualitymanagementkind() != null && otherMoreTrusted ? d.getQualitymanagementkind() : qualitymanagementkind;
+        pidsystems = d.getPidsystems() != null && otherMoreTrusted ? d.getPidsystems() : pidsystems;
+
+        certificates = d.getCertificates() != null && otherMoreTrusted ? d.getCertificates() : certificates;
+
+        policies = mergeLists(policies, d.getPolicies());
+
+        journal = d.getJournal() != null && otherMoreTrusted ? d.getJournal() : journal;
+
+        // must stay last: it may replace dataInfo, which the trust comparison reads
+        mergeOAFDataInfo(e);
+    }
 }
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java
@ -23,4 +23,21 @@ public class Field<T> implements Serializable {
    public void setDataInfo(DataInfo dataInfo) {
        this.dataInfo = dataInfo;
    }
+
+    /**
+     * Hash based on the wrapped value only; a field without value hashes to 0.
+     */
+    @Override
+    public int hashCode() {
+        if (getValue() == null) {
+            return 0;
+        }
+        return getValue().hashCode();
+    }
+
+    /**
+     * Two fields are equal when they are of the same class and wrap equal values; the
+     * attached dataInfo is not compared. Two fields without value are considered equal,
+     * matching {@link #hashCode()}.
+     *
+     * @param obj the object to compare with
+     * @return true if {@code obj} is a field wrapping an equal value
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj)
+            return true;
+        if (obj == null)
+            return false;
+        if (getClass() != obj.getClass())
+            return false;
+        // wildcard cast: the type argument is erased at runtime, so Field<T> would be unchecked
+        final Field<?> other = (Field<?>) obj;
+        final Object mine = getValue();
+        // null-safe: hashCode() accepts a missing value, so equals() must not throw on it
+        return mine == null ? other.getValue() == null : mine.equals(other.getValue());
+    }
 }
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java
@ -1,5 +1,7 @@
 package eu.dnetlib.dhp.schema.oaf;

+import org.apache.commons.lang3.StringUtils;
+
 import java.io.Serializable;

 public class GeoLocation implements Serializable {
@ -33,4 +35,35 @@ public class GeoLocation implements Serializable {
    public void setPlace(String place) {
        this.place = place;
    }
+
+
+    /**
+     * @return true when point, box and place are all blank
+     */
+    public boolean isBlank() {
+        if (!StringUtils.isBlank(point)) {
+            return false;
+        }
+        if (!StringUtils.isBlank(box)) {
+            return false;
+        }
+        return StringUtils.isBlank(place);
+    }
+
+    /**
+     * Builds the lower-cased key used by {@link #equals(Object)} and {@link #hashCode()}.
+     * The three parts are joined with "::" so that values cannot run into each other
+     * (without a separator, box "a" + place "bc" would equal box "ab" + place "c").
+     *
+     * @return the empty string for a blank location, otherwise {@code point::box::place}
+     */
+    public String toComparableString() {
+        if (isBlank()) {
+            return "";
+        }
+        return String.format("%s::%s::%s",
+                point != null ? point.toLowerCase() : "",
+                box != null ? box.toLowerCase() : "",
+                place != null ? place.toLowerCase() : "");
+    }
+
+    /**
+     * Hash of the comparable string, so it agrees with {@link #equals(Object)}.
+     */
+    @Override
+    public int hashCode() {
+        final String comparable = toComparableString();
+        return comparable.hashCode();
+    }
+
+    /**
+     * Two locations are equal when they are of the same class and produce the same
+     * comparable string.
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) {
+            return true;
+        }
+        // rejects null and any object of a different class
+        if (obj == null || getClass() != obj.getClass()) {
+            return false;
+        }
+        final String mine = toComparableString();
+        final String theirs = ((GeoLocation) obj).toComparableString();
+        return mine.equals(theirs);
+    }
 }
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java
@ -85,4 +85,34 @@ public class Instance implements Serializable {
    public void setDateofacceptance(Field<String> dateofacceptance) {
        this.dateofacceptance = dateofacceptance;
    }
+
+
+
+    /**
+     * Builds the key used by {@link #equals(Object)} and {@link #hashCode()}: the lower-cased
+     * hostedby key, the accessright class id, the instancetype class id and the url, joined
+     * with "::". Missing parts are rendered as empty strings.
+     */
+    public String toComparableString(){
+        return String.format("%s::%s::%s::%s",
+                hostedby == null || hostedby.getKey() == null ? "" : hostedby.getKey().toLowerCase(),
+                accessright == null || accessright.getClassid() == null ? "" : accessright.getClassid(),
+                instancetype == null || instancetype.getClassid() == null ? "" : instancetype.getClassid(),
+                url == null ? "" : url);
+    }
+
+    /**
+     * Hash of the comparable string, so it agrees with {@link #equals(Object)}.
+     */
+    @Override
+    public int hashCode() {
+        final String comparable = toComparableString();
+        return comparable.hashCode();
+    }
+
+    /**
+     * Two instances are equal when they are of the same class and produce the same
+     * comparable string.
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) {
+            return true;
+        }
+        // rejects null and any object of a different class
+        if (obj == null || getClass() != obj.getClass()) {
+            return false;
+        }
+        final String mine = toComparableString();
+        final String theirs = ((Instance) obj).toComparableString();
+        return mine.equals(theirs);
+    }
 }
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java
@ -1,5 +1,7 @@
 package eu.dnetlib.dhp.schema.oaf;

+import org.apache.commons.lang3.StringUtils;
+
 import java.io.Serializable;

 public class KeyValue implements Serializable {
@ -33,4 +35,31 @@ public class KeyValue implements Serializable {
    public void setDataInfo(DataInfo dataInfo) {
        this.dataInfo = dataInfo;
    }
+
+    /**
+     * Builds the lower-cased {@code key::value} string used by {@link #equals(Object)} and
+     * {@link #hashCode()}; a blank pair yields the empty string.
+     */
+    public String toComparableString() {
+        if (isBlank()) {
+            return "";
+        }
+        final String keyPart = key == null ? "" : key.toLowerCase();
+        final String valuePart = value == null ? "" : value.toLowerCase();
+        return String.format("%s::%s", keyPart, valuePart);
+    }
+
+    /**
+     * @return true when both key and value are blank
+     */
+    public boolean isBlank() {
+        if (!StringUtils.isBlank(key)) {
+            return false;
+        }
+        return StringUtils.isBlank(value);
+    }
+
+    /**
+     * Hash of the comparable string, so it agrees with {@link #equals(Object)}.
+     */
+    @Override
+    public int hashCode() {
+        final String comparable = toComparableString();
+        return comparable.hashCode();
+    }
+
+    /**
+     * Two pairs are equal when they are of the same class and produce the same comparable
+     * string; the attached dataInfo is not compared.
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) {
+            return true;
+        }
+        // rejects null and any object of a different class
+        if (obj == null || getClass() != obj.getClass()) {
+            return false;
+        }
+        final String mine = toComparableString();
+        final String theirs = ((KeyValue) obj).toComparableString();
+        return mine.equals(theirs);
+    }
 }
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java
@ -1,8 +1,5 @@
 package eu.dnetlib.dhp.schema.oaf;

-import com.fasterxml.jackson.core.JsonProcessingException;
-import com.fasterxml.jackson.databind.ObjectMapper;
-
 import java.io.Serializable;

 public abstract class Oaf implements Serializable {
@ -27,13 +24,23 @@ public abstract class Oaf implements Serializable {
        this.lastupdatetimestamp = lastupdatetimestamp;
    }

-    @Override
-    public String toString() {
-        try {
-            return new ObjectMapper().writeValueAsString(this);
-        } catch (JsonProcessingException e) {
-            throw new RuntimeException(e);
-        }
+
+    /**
+     * Takes over the dataInfo of {@code e} when it is present and {@code e} has the higher
+     * trust according to {@code compareTrust}.
+     *
+     * @param e the object whose dataInfo may replace the current one; null is ignored
+     */
+    public void mergeOAFDataInfo(Oaf e) {
+        // extractTrust tolerates a null argument, so this entry point should not throw on it either
+        if (e == null)
+            return;
+        if (e.getDataInfo()!= null && compareTrust(this,e)<0)
+            dataInfo = e.getDataInfo();
    }

+    /**
+     * Reads the trust of {@code e}, falling back to "0.0" when the object, its dataInfo or
+     * the trust itself is missing.
+     */
+    protected String extractTrust(Oaf e) {
+        final boolean hasTrust = e != null
+                && e.getDataInfo() != null
+                && e.getDataInfo().getTrust() != null;
+        return hasTrust ? e.getDataInfo().getTrust() : "0.0";
+    }
+
+    /**
+     * Compares the trust of two objects; a missing trust counts as "0.0".
+     * Trust values are compared numerically, so "0.9" equals "0.90" and "10" ranks above
+     * "9". Values that cannot be parsed as numbers fall back to plain string comparison.
+     *
+     * @param a the first object, may be null
+     * @param b the second object, may be null
+     * @return a negative number if {@code a} is less trusted than {@code b}, zero if both
+     *         are equally trusted, a positive number otherwise
+     */
+    protected int compareTrust(Oaf a, Oaf b) {
+        final String trustA = extractTrust(a);
+        final String trustB = extractTrust(b);
+        try {
+            return Double.compare(Double.parseDouble(trustA), Double.parseDouble(trustB));
+        } catch (NumberFormatException ex) {
+            // non-numeric trust: keep the previous lexicographic ordering instead of failing the merge
+            return trustA.compareTo(trustB);
+        }
+    }
 }
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java
@ -1,7 +1,8 @@
 package eu.dnetlib.dhp.schema.oaf;

 import java.io.Serializable;
-import java.util.List;
+import java.util.*;
+import java.util.stream.Collectors;

 public abstract class OafEntity extends Oaf implements Serializable {

@ -84,4 +85,36 @@ public abstract class OafEntity extends Oaf implements Serializable {
    public void setOaiprovenance(OAIProvenance oaiprovenance) {
        this.oaiprovenance = oaiprovenance;
    }
+
+
+    /**
+     * Merges the common entity fields of {@code e} into this entity. List fields are merged
+     * as duplicate-free lists; the collection date, the transformation date and the OAI
+     * provenance are taken from {@code e} only when present there and {@code e} has the
+     * higher trust.
+     *
+     * @param e the entity to merge from; null is ignored
+     */
+    public void mergeFrom(OafEntity e) {
+        if (e == null) {
+            return;
+        }
+
+        final boolean otherMoreTrusted = compareTrust(this, e) < 0;
+
+        originalId = mergeLists(originalId, e.getOriginalId());
+        collectedfrom = mergeLists(collectedfrom, e.getCollectedfrom());
+        pid = mergeLists(pid, e.getPid());
+
+        if (otherMoreTrusted && e.getDateofcollection() != null) {
+            dateofcollection = e.getDateofcollection();
+        }
+        if (otherMoreTrusted && e.getDateoftransformation() != null) {
+            dateoftransformation = e.getDateoftransformation();
+        }
+
+        extraInfo = mergeLists(extraInfo, e.getExtraInfo());
+
+        if (otherMoreTrusted && e.getOaiprovenance() != null) {
+            oaiprovenance = e.getOaiprovenance();
+        }
+    }
+
+    /**
+     * Concatenates the given lists, skipping null lists and dropping duplicate elements
+     * while keeping the order of first occurrence.
+     *
+     * @param lists the lists to merge; individual lists may be null
+     * @return a new list with the distinct elements of all non-null lists
+     */
+    protected <T> List<T> mergeLists(final List<T>... lists) {
+        // insertion-ordered set: keeps the first occurrence of every element
+        final Set<T> unique = new LinkedHashSet<>();
+        for (final List<T> list : lists) {
+            if (list != null) {
+                unique.addAll(list);
+            }
+        }
+        return new ArrayList<>(unique);
+    }
+
+
 }
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Organization.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Organization.java
@ -164,4 +164,28 @@ public class Organization extends OafEntity implements Serializable {
    public void setCountry(Qualifier country) {
        this.country = country;
    }
+
+
+    /**
+     * Merges {@code e} into this organization. After delegating to the superclass, every
+     * scalar field is replaced by the value of {@code e} when that value is not null and
+     * {@code e} has the higher trust; alternative names are merged as a duplicate-free list.
+     *
+     * @param e the entity to merge from; null is ignored
+     * @throws ClassCastException if {@code e} is not an {@link Organization}
+     */
+    @Override
+    public void mergeFrom(OafEntity e) {
+        // nothing to merge; without this guard the accessors below would throw NPE
+        if (e == null)
+            return;
+
+        super.mergeFrom(e);
+        final Organization o = (Organization) e;
+
+        // the field assignments below do not touch dataInfo, so the comparison is evaluated once
+        final boolean otherMoreTrusted = compareTrust(this, e) < 0;
+
+        legalshortname = o.getLegalshortname() != null && otherMoreTrusted ? o.getLegalshortname() : legalshortname;
+        legalname = o.getLegalname() != null && otherMoreTrusted ? o.getLegalname() : legalname;
+        alternativeNames = mergeLists(o.getAlternativeNames(), alternativeNames);
+        websiteurl = o.getWebsiteurl() != null && otherMoreTrusted ? o.getWebsiteurl() : websiteurl;
+        logourl = o.getLogourl() != null && otherMoreTrusted ? o.getLogourl() : logourl;
+        eclegalbody = o.getEclegalbody() != null && otherMoreTrusted ? o.getEclegalbody() : eclegalbody;
+        eclegalperson = o.getEclegalperson() != null && otherMoreTrusted ? o.getEclegalperson() : eclegalperson;
+        ecnonprofit = o.getEcnonprofit() != null && otherMoreTrusted ? o.getEcnonprofit() : ecnonprofit;
+        ecresearchorganization = o.getEcresearchorganization() != null && otherMoreTrusted ? o.getEcresearchorganization() : ecresearchorganization;
+        echighereducation = o.getEchighereducation() != null && otherMoreTrusted ? o.getEchighereducation() : echighereducation;
+        ecinternationalorganizationeurinterests = o.getEcinternationalorganizationeurinterests() != null && otherMoreTrusted ? o.getEcinternationalorganizationeurinterests() : ecinternationalorganizationeurinterests;
+        ecinternationalorganization = o.getEcinternationalorganization() != null && otherMoreTrusted ? o.getEcinternationalorganization() : ecinternationalorganization;
+        ecenterprise = o.getEcenterprise() != null && otherMoreTrusted ? o.getEcenterprise() : ecenterprise;
+        ecsmevalidated = o.getEcsmevalidated() != null && otherMoreTrusted ? o.getEcsmevalidated() : ecsmevalidated;
+        ecnutscode = o.getEcnutscode() != null && otherMoreTrusted ? o.getEcnutscode() : ecnutscode;
+        country = o.getCountry() != null && otherMoreTrusted ? o.getCountry() : country;
+        // must stay last: it may replace dataInfo, which the trust comparison reads
+        mergeOAFDataInfo(o);
+    }
 }
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OtherResearchProduct.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OtherResearchProduct.java
@ -34,4 +34,16 @@ public class OtherResearchProduct extends Result implements Serializable {
    public void setTool(List<Field<String>> tool) {
        this.tool = tool;
    }
+
+    /**
+     * Merges {@code e} into this product. After delegating to the superclass, the contact
+     * persons, contact groups and tools of both objects are merged as duplicate-free lists.
+     *
+     * @param e the entity to merge from; null is ignored
+     * @throws ClassCastException if {@code e} is not an {@link OtherResearchProduct}
+     */
+    @Override
+    public void mergeFrom(OafEntity e) {
+        // nothing to merge; without this guard the accessors below would throw NPE
+        if (e == null)
+            return;
+
+        super.mergeFrom(e);
+
+        OtherResearchProduct o = (OtherResearchProduct)e;
+
+        contactperson = mergeLists(contactperson, o.getContactperson());
+        contactgroup = mergeLists(contactgroup, o.getContactgroup());
+        tool = mergeLists(tool, o.getTool());
+        mergeOAFDataInfo(e);
+    }
 }
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java
@ -264,4 +264,39 @@ public class Project extends OafEntity implements Serializable {
    public void setFundedamount(Float fundedamount) {
        this.fundedamount = fundedamount;
    }
+
+
+    /**
+     * Merges {@code e} into this project. After delegating to the superclass, every scalar
+     * field is replaced by the value of {@code e} when that value is not null and {@code e}
+     * has the higher trust; subjects and funding trees are merged as duplicate-free lists.
+     *
+     * @param e the entity to merge from; null is ignored
+     * @throws ClassCastException if {@code e} is not a {@link Project}
+     */
+    @Override
+    public void mergeFrom(OafEntity e) {
+        // nothing to merge; without this guard the accessors below would throw NPE
+        if (e == null)
+            return;
+
+        super.mergeFrom(e);
+        Project p = (Project)e;
+
+        // the field assignments below do not touch dataInfo, so the comparison is evaluated once
+        final boolean otherMoreTrusted = compareTrust(this, e) < 0;
+
+        websiteurl = p.getWebsiteurl() != null && otherMoreTrusted ? p.getWebsiteurl() : websiteurl;
+        code = p.getCode() != null && otherMoreTrusted ? p.getCode() : code;
+        acronym = p.getAcronym() != null && otherMoreTrusted ? p.getAcronym() : acronym;
+        title = p.getTitle() != null && otherMoreTrusted ? p.getTitle() : title;
+        startdate = p.getStartdate() != null && otherMoreTrusted ? p.getStartdate() : startdate;
+        enddate = p.getEnddate() != null && otherMoreTrusted ? p.getEnddate() : enddate;
+        callidentifier = p.getCallidentifier() != null && otherMoreTrusted ? p.getCallidentifier() : callidentifier;
+        keywords = p.getKeywords() != null && otherMoreTrusted ? p.getKeywords() : keywords;
+        duration = p.getDuration() != null && otherMoreTrusted ? p.getDuration() : duration;
+        ecsc39 = p.getEcsc39() != null && otherMoreTrusted ? p.getEcsc39() : ecsc39;
+        oamandatepublications = p.getOamandatepublications() != null && otherMoreTrusted ? p.getOamandatepublications() : oamandatepublications;
+        ecarticle29_3 = p.getEcarticle29_3() != null && otherMoreTrusted ? p.getEcarticle29_3() : ecarticle29_3;
+        subjects = mergeLists(subjects, p.getSubjects());
+        fundingtree = mergeLists(fundingtree, p.getFundingtree());
+        contracttype = p.getContracttype() != null && otherMoreTrusted ? p.getContracttype() : contracttype;
+        optional1 = p.getOptional1() != null && otherMoreTrusted ? p.getOptional1() : optional1;
+        optional2 = p.getOptional2() != null && otherMoreTrusted ? p.getOptional2() : optional2;
+        jsonextrainfo = p.getJsonextrainfo() != null && otherMoreTrusted ? p.getJsonextrainfo() : jsonextrainfo;
+        contactfullname = p.getContactfullname() != null && otherMoreTrusted ? p.getContactfullname() : contactfullname;
+        contactfax = p.getContactfax() != null && otherMoreTrusted ? p.getContactfax() : contactfax;
+        contactphone = p.getContactphone() != null && otherMoreTrusted ? p.getContactphone() : contactphone;
+        contactemail = p.getContactemail() != null && otherMoreTrusted ? p.getContactemail() : contactemail;
+        summary = p.getSummary() != null && otherMoreTrusted ? p.getSummary() : summary;
+        currency = p.getCurrency() != null && otherMoreTrusted ? p.getCurrency() : currency;
+        totalcost = p.getTotalcost() != null && otherMoreTrusted ? p.getTotalcost() : totalcost;
+        fundedamount = p.getFundedamount() != null && otherMoreTrusted ? p.getFundedamount() : fundedamount;
+        // must stay last: it may replace dataInfo, which the trust comparison reads
+        mergeOAFDataInfo(e);
+    }
 }
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Publication.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Publication.java
@ -14,4 +14,17 @@ public class Publication extends Result implements Serializable {
    public void setJournal(Journal journal) {
        this.journal = journal;
    }
+
+    /**
+     * Merges the given entity into this publication.
+     * <p>
+     * After delegating to the superclass merge, the journal of {@code e} replaces the local one
+     * when it is non-null and {@code compareTrust(this, e)} is negative; finally
+     * {@code mergeOAFDataInfo(e)} is invoked.
+     *
+     * @param e the entity to merge from; it is cast to {@link Publication}
+     */
+    @Override
+    public void mergeFrom(OafEntity e) {
+        super.mergeFrom(e);
+
+        final Publication other = (Publication) e;
+        final Journal incomingJournal = other.getJournal();
+
+        // NOTE(review): a negative compareTrust presumably means "this" is the less trusted record - confirm in OafEntity
+        if (incomingJournal != null && compareTrust(this, e) < 0) {
+            journal = incomingJournal;
+        }
+        mergeOAFDataInfo(e);
+    }
+
+
 }
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java
@ -1,5 +1,7 @@
 package eu.dnetlib.dhp.schema.oaf;

+import org.apache.commons.lang3.StringUtils;
+
 import java.io.Serializable;

 public class Qualifier implements Serializable {
@ -40,4 +42,37 @@ public class Qualifier implements Serializable {
    public void setSchemename(String schemename) {
        this.schemename = schemename;
    }
+
+    /**
+     * Builds the string used by {@link #equals(Object)} and {@link #hashCode()} to compare qualifiers.
+     *
+     * @return the empty string when {@link #isBlank()} holds, otherwise the four fields joined
+     *         by {@code ::} (null fields are rendered as empty strings)
+     */
+    public String toComparableString() {
+        if (isBlank()) {
+            return "";
+        }
+        return String.format("%s::%s::%s::%s",
+                StringUtils.defaultString(classid),
+                StringUtils.defaultString(classname),
+                StringUtils.defaultString(schemeid),
+                StringUtils.defaultString(schemename));
+    }
+    /**
+     * Tells whether this qualifier carries no information.
+     *
+     * @return {@code true} when classid, classname, schemeid and schemename are all blank
+     */
+    public boolean isBlank() {
+        final boolean anyFilled = StringUtils.isNotBlank(classid)
+                || StringUtils.isNotBlank(classname)
+                || StringUtils.isNotBlank(schemeid)
+                || StringUtils.isNotBlank(schemename);
+        return !anyFilled;
+    }
+    /**
+     * Hash code derived from {@link #toComparableString()}, so it agrees with {@link #equals(Object)}.
+     */
+    @Override
+    public int hashCode() {
+        final String comparable = toComparableString();
+        return comparable.hashCode();
+    }
+
+    /**
+     * Two qualifiers are equal when they are of the same class and their
+     * {@link #toComparableString()} representations match.
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) {
+            return true;
+        }
+        if (obj == null || getClass() != obj.getClass()) {
+            return false;
+        }
+
+        final String mine = toComparableString();
+        final String theirs = ((Qualifier) obj).toComparableString();
+        return mine.equals(theirs);
+    }
 }
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java
@ -1,7 +1,10 @@
 package eu.dnetlib.dhp.schema.oaf;

+import org.apache.commons.lang3.StringUtils;
+
 import java.io.Serializable;
-import java.util.List;
+import java.util.*;
+import java.util.stream.Collectors;

 public abstract class Result extends OafEntity implements Serializable {

@ -12,35 +15,35 @@ public abstract class Result extends OafEntity implements Serializable {

    // common fields
    private Qualifier language;
-    
+
    private List<Qualifier> country;

    private List<StructuredProperty> subject;
-    
+
    private List<StructuredProperty> title;
-    
+
    private List<StructuredProperty> relevantdate;

    private List<Field<String>> description;
-    
+
    private Field<String> dateofacceptance;
-        
+
    private Field<String> publisher;
-    
+
    private Field<String> embargoenddate;
-    
+
    private List<Field<String>> source;
-    
+
    private List<Field<String>> fulltext; // remove candidate
-    
+
    private List<Field<String>> format;
-    
+
    private List<Field<String>> contributor;
-    
+
    private Qualifier resourcetype;
-    
+
    private List<Field<String>> coverage;
-    
+
    private Field<String> refereed; //peer-review status

    private List<Context> context;
@ -240,4 +243,76 @@ public abstract class Result extends OafEntity implements Serializable {
        this.processingchargecurrency = processingchargecurrency;
        return this;
    }
+
+    /**
+     * Merges the content of another {@link Result} into this one.
+     * <ul>
+     *   <li>list-valued fields are combined through {@code mergeLists};</li>
+     *   <li>{@code description} is chosen through {@code longestLists};</li>
+     *   <li>single-valued fields are replaced by the incoming value only when that value is
+     *       non-null and {@code compareTrust(this, r)} is negative.</li>
+     * </ul>
+     * {@code author} and {@code dateofacceptance} are not touched by this method.
+     *
+     * @param e the entity to merge from; it is cast to {@link Result}
+     */
+    @Override
+    public void mergeFrom(OafEntity e) {
+        super.mergeFrom(e);
+
+        Result r = (Result) e;
+
+        instance = mergeLists(instance, r.getInstance());
+
+        if (r.getResulttype() != null && compareTrust(this, r) < 0)
+            resulttype = r.getResulttype();
+
+        if (r.getLanguage() != null && compareTrust(this, r) < 0)
+            language = r.getLanguage();
+
+        country = mergeLists(country, r.getCountry());
+
+        subject = mergeLists(subject, r.getSubject());
+
+        title = mergeLists(title, r.getTitle());
+
+        relevantdate = mergeLists(relevantdate, r.getRelevantdate());
+
+        description = longestLists(description, r.getDescription());
+
+        if (r.getPublisher() != null && compareTrust(this, r) < 0)
+            publisher = r.getPublisher();
+
+        if (r.getEmbargoenddate() != null && compareTrust(this, r) < 0)
+            embargoenddate = r.getEmbargoenddate();
+
+        source = mergeLists(source, r.getSource());
+
+        fulltext = mergeLists(fulltext, r.getFulltext());
+
+        format = mergeLists(format, r.getFormat());
+
+        contributor = mergeLists(contributor, r.getContributor());
+
+        // same trust rule as every other single-valued field: do not let any incoming
+        // record overwrite the resource type unconditionally
+        if (r.getResourcetype() != null && compareTrust(this, r) < 0)
+            resourcetype = r.getResourcetype();
+
+        coverage = mergeLists(coverage, r.getCoverage());
+
+        if (r.getRefereed() != null && compareTrust(this, r) < 0)
+            refereed = r.getRefereed();
+
+        context = mergeLists(context, r.getContext());
+
+        if (r.getProcessingchargeamount() != null && compareTrust(this, r) < 0)
+            processingchargeamount = r.getProcessingchargeamount();
+
+        if (r.getProcessingchargecurrency() != null && compareTrust(this, r) < 0)
+            processingchargecurrency = r.getProcessingchargecurrency();
+
+        externalReference = mergeLists(externalReference, r.getExternalReference());
+
+    }
+
+
+    /**
+     * Picks one of the two lists: the non-null one when the other is null, otherwise the one with
+     * more elements. On equal sizes the list holding the longest single value wins, and
+     * {@code b} is returned when that is a tie as well.
+     *
+     * @param a first candidate, may be null
+     * @param b second candidate, may be null
+     * @return the selected list (null only when {@code a} is null and {@code b} is null)
+     */
+    private List<Field<String>> longestLists(List<Field<String>> a, List<Field<String>> b) {
+        if (a == null) {
+            return b;
+        }
+        if (b == null) {
+            return a;
+        }
+        if (a.size() != b.size()) {
+            return a.size() > b.size() ? a : b;
+        }
+        return longestValueLength(a) > longestValueLength(b) ? a : b;
+    }
+
+    /** Length of the longest non-null value in the list, 0 when there is none. */
+    private static int longestValueLength(List<Field<String>> fields) {
+        return fields.stream()
+                .map(Field::getValue)
+                .filter(Objects::nonNull)
+                .mapToInt(String::length)
+                .max()
+                .orElse(0);
+    }
+
+
 }
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Software.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Software.java
@ -44,4 +44,19 @@ public class Software extends Result implements Serializable {
    public void setProgrammingLanguage(Qualifier programmingLanguage) {
        this.programmingLanguage = programmingLanguage;
    }
+
+    /**
+     * Merges the given entity into this software record.
+     * <p>
+     * After the superclass merge, {@code documentationUrl} and {@code license} are combined via
+     * {@code mergeLists}; {@code codeRepositoryUrl} and {@code programmingLanguage} are replaced
+     * when the incoming value is non-null and {@code compareTrust(this, s)} is negative. Finally
+     * {@code mergeOAFDataInfo(e)} is invoked.
+     *
+     * @param e the entity to merge from; it is cast to {@link Software}
+     */
+    @Override
+    public void mergeFrom(OafEntity e) {
+        super.mergeFrom(e);
+        final Software s = (Software) e;
+
+        documentationUrl = mergeLists(documentationUrl, s.getDocumentationUrl());
+        license = mergeLists(license, s.getLicense());
+
+        if (s.getCodeRepositoryUrl() != null && compareTrust(this, s) < 0) {
+            codeRepositoryUrl = s.getCodeRepositoryUrl();
+        }
+
+        if (s.getProgrammingLanguage() != null && compareTrust(this, s) < 0) {
+            programmingLanguage = s.getProgrammingLanguage();
+        }
+
+        mergeOAFDataInfo(e);
+    }
 }
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java
@ -33,4 +33,28 @@ public class StructuredProperty implements Serializable {
    public void setDataInfo(DataInfo dataInfo) {
        this.dataInfo = dataInfo;
    }
+
+    /**
+     * Builds the string used by {@link #equals(Object)} and {@link #hashCode()}: the value in
+     * lower case, or the empty string when the value is null.
+     * <p>
+     * The conversion uses {@code Locale.ROOT} so that equality and hash codes do not depend on
+     * the default locale of the JVM running the job.
+     *
+     * @return the lower-cased value, never null
+     */
+    public String toComparableString(){
+        return  value != null ? value.toLowerCase(java.util.Locale.ROOT) : "";
+    }
+
+    /**
+     * Hash code derived from {@link #toComparableString()}, so it agrees with {@link #equals(Object)}.
+     */
+    @Override
+    public int hashCode() {
+        final String comparable = toComparableString();
+        return comparable.hashCode();
+    }
+
+    /**
+     * Two structured properties are equal when they are of the same class and their
+     * {@link #toComparableString()} representations match.
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) {
+            return true;
+        }
+        if (obj == null || getClass() != obj.getClass()) {
+            return false;
+        }
+
+        final String mine = toComparableString();
+        final String theirs = ((StructuredProperty) obj).toComparableString();
+        return mine.equals(theirs);
+    }
 }
--- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java
+++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java
@ -0,0 +1,89 @@
+package eu.dnetlib.dhp.schema.oaf;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Exercises the merge helpers of the OAF entities: plain list merging and the merge of
+ * {@code collectedfrom} and {@code subject} between two publications.
+ */
+public class MergeTest {
+
+    OafEntity oaf;
+
+    @Before
+    public void setUp() {
+        oaf = new Publication();
+    }
+
+    /** Prints the outcome of merging string lists, including null operands (no assertions). */
+    @Test
+    public void mergeListsTest() {
+
+        final List<String> left = Arrays.asList("a", "b", "c", "e");
+        final List<String> right = Arrays.asList("a", "b", "c", "d");
+        final List<String> missing = null;
+
+        System.out.println("merge result 1 = " + oaf.mergeLists(left, right));
+
+        System.out.println("merge result 2 = " + oaf.mergeLists(left, missing));
+
+        System.out.println("merge result 3 = " + oaf.mergeLists(missing, missing));
+    }
+
+    /** Merging two publications is expected to leave three collectedfrom entries. */
+    @Test
+    public void mergePublicationCollectedFromTest() {
+
+        final Publication target = new Publication();
+        final Publication incoming = new Publication();
+
+        final List<KeyValue> targetSources = Arrays.asList(setKV("a", "open"), setKV("b", "closed"));
+        final List<KeyValue> incomingSources = Arrays.asList(setKV("A", "open"), setKV("b", "Open"));
+        target.setCollectedfrom(targetSources);
+        incoming.setCollectedfrom(incomingSources);
+
+        target.mergeFrom(incoming);
+
+        Assert.assertNotNull(target.getCollectedfrom());
+        Assert.assertEquals(3, target.getCollectedfrom().size());
+    }
+
+    /** Merging two publications is expected to leave three subjects ("a"/"A" collapse into one). */
+    @Test
+    public void mergePublicationSubjectTest() {
+
+        final Publication target = new Publication();
+        final Publication incoming = new Publication();
+
+        final List<StructuredProperty> targetSubjects =
+                Arrays.asList(setSP("a", "open", "classe"), setSP("b", "open", "classe"));
+        final List<StructuredProperty> incomingSubjects =
+                Arrays.asList(setSP("A", "open", "classe"), setSP("c", "open", "classe"));
+        target.setSubject(targetSubjects);
+        incoming.setSubject(incomingSubjects);
+
+        target.mergeFrom(incoming);
+
+        Assert.assertNotNull(target.getSubject());
+        Assert.assertEquals(3, target.getSubject().size());
+    }
+
+    /** Creates a {@link KeyValue} with the given key and value. */
+    private KeyValue setKV(final String key, final String value) {
+        final KeyValue kv = new KeyValue();
+        kv.setKey(key);
+        kv.setValue(value);
+        return kv;
+    }
+
+    /**
+     * Creates a {@link StructuredProperty} with the given value and a qualifier whose class id/name
+     * are both {@code classname} and whose scheme id/name are both {@code schema}.
+     */
+    private StructuredProperty setSP(final String value, final String schema, final String classname) {
+        final Qualifier qualifier = new Qualifier();
+        qualifier.setClassname(classname);
+        qualifier.setClassid(classname);
+        qualifier.setSchemename(schema);
+        qualifier.setSchemeid(schema);
+
+        final StructuredProperty property = new StructuredProperty();
+        property.setValue(value);
+        property.setQualifier(qualifier);
+        return property;
+    }
+}
--- a/dhp-workflows/dhp-aggregation/pom.xml
+++ b/dhp-workflows/dhp-aggregation/pom.xml
@ -7,6 +7,8 @@
        <version>1.0.5-SNAPSHOT</version>
    </parent>
    <artifactId>dhp-aggregation</artifactId>
+    
+  
    <dependencies>

        <dependency>
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java
@ -89,6 +89,8 @@ public class TransformationJobTest {
                "-rh",  "",
                "-ro",  "",
                "-rr",  ""});
+
+
    }

    @Test
@ -96,7 +98,7 @@ public class TransformationJobTest {
        final String path = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile();
        System.out.println("path = " + path);

-        Path tempDirWithPrefix = Files.createTempDirectory("mdsotre_output");
+        Path tempDirWithPrefix = Files.createTempDirectory("mdstore_output");

        System.out.println(tempDirWithPrefix.toFile().getAbsolutePath());

--- a/dhp-workflows/dhp-dedup/pom.xml
+++ b/dhp-workflows/dhp-dedup/pom.xml
@ -0,0 +1,61 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>dhp-workflows</artifactId>
+        <groupId>eu.dnetlib.dhp</groupId>
+        <version>1.0.5-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <artifactId>dhp-dedup</artifactId>
+
+    <dependencies>
+
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-core_2.11</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-sql_2.11</artifactId>
+        </dependency>
+
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-common</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-schemas</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>com.arakelian</groupId>
+            <artifactId>java-jq</artifactId>
+        </dependency>
+
+        <dependency>
+            <groupId>eu.dnetlib</groupId>
+            <artifactId>dnet-pace-core</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-graphx_2.11</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-databind</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-core</artifactId>
+        </dependency>
+
+
+
+    </dependencies>
+
+
+</project>
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DatePicker.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DatePicker.java
@ -0,0 +1,119 @@
+package eu.dnetlib.dedup;
+
+import eu.dnetlib.dhp.schema.oaf.Field;
+import org.apache.commons.lang.StringUtils;
+
+import java.time.Year;
+import java.util.*;
+import java.util.stream.Collectors;
+
+import static java.util.Collections.reverseOrder;
+import static java.util.Map.Entry.comparingByValue;
+import static java.util.stream.Collectors.toMap;
+import static org.apache.commons.lang.StringUtils.endsWith;
+import static org.apache.commons.lang.StringUtils.substringBefore;
+
+public class DatePicker {
+
+    private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}";
+    private static final String DATE_DEFAULT_SUFFIX = "01-01";
+    private static final int YEAR_LB = 1300;
+    private static final int YEAR_UB = Year.now().getValue() + 5;
+
+    public static Field<String> pick(final Collection<String> dateofacceptance) {
+
+        final Map<String, Integer> frequencies = dateofacceptance
+                .parallelStream()
+                .filter(StringUtils::isNotBlank)
+                .collect(
+                        Collectors.toConcurrentMap(
+                                w -> w, w -> 1, Integer::sum));
+
+        if (frequencies.isEmpty()) {
+            return new Field<>();
+        }
+
+        final Field<String> date = new Field<>();
+                date.setValue(frequencies.keySet().iterator().next());
+
+        // let's sort this map by values first, filtering out invalid dates
+        final Map<String, Integer> sorted = frequencies
+                .entrySet()
+                .stream()
+                .filter(d -> StringUtils.isNotBlank(d.getKey()))
+                .filter(d -> d.getKey().matches(DATE_PATTERN))
+                .filter(d -> inRange(d.getKey()))
+                .sorted(reverseOrder(comparingByValue()))
+                .collect(
+                        toMap(
+                                Map.Entry::getKey,
+                                Map.Entry::getValue, (e1, e2) -> e2,
+                                LinkedHashMap::new));
+
+        // shortcut
+        if (sorted.size() == 0) {
+            return date;
+        }
+
+        // voting method (1/3 + 1) wins
+        if (sorted.size() >= 3) {
+            final int acceptThreshold = (sorted.size() / 3) + 1;
+            final List<String> accepted = sorted.entrySet().stream()
+                    .filter(e -> e.getValue() >= acceptThreshold)
+                    .map(e -> e.getKey())
+                    .collect(Collectors.toList());
+
+            // cannot find strong majority
+            if (accepted.isEmpty()) {
+                final int max = sorted.values().iterator().next();
+                Optional<String> first = sorted.entrySet().stream()
+                        .filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX))
+                        .map(Map.Entry::getKey)
+                        .findFirst();
+                if (first.isPresent()) {
+                    date.setValue(first.get());
+                    return date;
+                }
+
+                date.setValue(sorted.keySet().iterator().next());
+                return date;
+            }
+
+            if (accepted.size() == 1) {
+                date.setValue(accepted.get(0));
+                return date;
+            } else {
+                final Optional<String> first = accepted.stream()
+                        .filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX))
+                        .findFirst();
+                if (first.isPresent()) {
+                    date.setValue(first.get());
+                    return date;
+                }
+
+                return date;
+            }
+
+            //1st non YYYY-01-01 is returned
+        } else {
+            if (sorted.size() == 2) {
+                for (Map.Entry<String, Integer> e : sorted.entrySet()) {
+                    if (!endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) {
+                        date.setValue(e.getKey());
+                        return date;
+                    }
+                }
+            }
+
+            // none of the dates seems good enough, return the 1st one
+            date.setValue(sorted.keySet().iterator().next());
+            return date;
+        }
+    }
+
+    private static boolean inRange(final String date) {
+        final int year = Integer.parseInt(substringBefore(date, "-"));
+        return year >= YEAR_LB && year <= YEAR_UB;
+    }
+
+}
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java
@ -0,0 +1,279 @@
+package eu.dnetlib.dedup;
+
+import com.google.common.collect.Lists;
+import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.util.MapDocumentUtil;
+import org.apache.commons.lang.NotImplementedException;
+import org.apache.commons.lang.StringUtils;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SparkSession;
+import org.codehaus.jackson.map.ObjectMapper;
+import scala.Tuple2;
+
+import java.util.Collection;
+import java.util.Random;
+
+import static java.util.stream.Collectors.toMap;
+
+public class DedupRecordFactory {
+
+    /**
+     * Builds one merged ("dedup") record for every group of entities connected by a
+     * {@code merges} relation.
+     * <p>
+     * The entities are read as text lines from {@code entitiesInputPath} and keyed by the id
+     * extracted with the id path of the dedup configuration; the relations are loaded from
+     * {@code mergeRelsInputPath}, restricted to {@code relClass=='merges'}, joined with the
+     * entities on the relation target and grouped by relation source. Each group is then merged
+     * by the merger matching {@code entityType}.
+     *
+     * @param sc                 Spark context used to read the entities
+     * @param spark              Spark session used to load the relations
+     * @param mergeRelsInputPath location of the relations (presumably in Spark's default format - TODO confirm)
+     * @param entitiesInputPath  location of the JSON entities, one per line
+     * @param entityType         type of the entities being merged
+     * @param dedupConf          dedup configuration providing the id path
+     * @return the merged records, or {@code null} when {@code entityType} is not handled by the switch
+     */
+    public static JavaRDD<OafEntity> createDedupRecord(final JavaSparkContext sc, final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final OafEntityType entityType, final DedupConfig dedupConf) {
+        // single timestamp shared by all the records produced by this invocation
+        long ts = System.currentTimeMillis();
+        //<id, json_entity>
+        final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(entitiesInputPath)
+                .mapToPair((PairFunction<String, String, String>) it ->
+                        new Tuple2<String, String>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it), it)
+                );
+
+        //<target, source>: the key is the relation target (joined below with the entity ids), the value is the relation source (used below as dedup_id)
+        JavaPairRDD<String, String> mergeRels = spark
+                .read().load(mergeRelsInputPath).as(Encoders.bean(Relation.class))
+                .where("relClass=='merges'")
+                .javaRDD()
+                .mapToPair(
+                        (PairFunction<Relation, String, String>) r ->
+                                new Tuple2<String, String>(r.getTarget(), r.getSource())
+                );
+
+        //<dedup_id, json_entity_merged>
+        final JavaPairRDD<String, String> joinResult = mergeRels.join(inputJsonEntities).mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
+
+        // despite the name, the entities are only grouped by dedup_id here, not sorted
+        JavaPairRDD<String, Iterable<String>> sortedJoinResult = joinResult.groupByKey();
+
+        switch (entityType) {
+            case publication:
+                return sortedJoinResult.map(p -> DedupRecordFactory.publicationMerger(p, ts));
+            case dataset:
+                return sortedJoinResult.map(d -> DedupRecordFactory.datasetMerger(d, ts));
+            case project:
+                return sortedJoinResult.map(p -> DedupRecordFactory.projectMerger(p, ts));
+            case software:
+                return sortedJoinResult.map(s -> DedupRecordFactory.softwareMerger(s, ts));
+            case datasource:
+                return sortedJoinResult.map(d -> DedupRecordFactory.datasourceMerger(d, ts));
+            case organization:
+                return sortedJoinResult.map(o -> DedupRecordFactory.organizationMerger(o, ts));
+            case otherresearchproduct:
+                return sortedJoinResult.map(o -> DedupRecordFactory.otherresearchproductMerger(o, ts));
+            default:
+                return null;
+        }
+
+    }
+
+    /**
+     * Merges a group of JSON-serialized publications into a single record.
+     * <p>
+     * Every member is merged with {@code mergeFrom}, authors are combined through
+     * {@code DedupUtility.mergeAuthor} and the date of acceptance is chosen by {@link DatePicker}.
+     * The result gets trust {@code "0.9"} and {@code ts} as last update timestamp.
+     *
+     * @param e  pair of dedup id and JSON members of the group
+     * @param ts timestamp to set on the merged record
+     * @return the merged publication
+     * @throws RuntimeException wrapping any failure while parsing or merging a member
+     */
+    private static Publication publicationMerger(Tuple2<String, Iterable<String>> e, final long ts) {
+
+        final Publication merged = new Publication();
+        merged.setId(e._1());
+
+        final ObjectMapper jsonMapper = new ObjectMapper();
+        final Collection<String> acceptanceDates = Lists.newArrayList();
+
+        final Iterable<String> members = e._2();
+        if (members != null) {
+            for (final String json : members) {
+                try {
+                    final Publication current = jsonMapper.readValue(json, Publication.class);
+
+                    merged.mergeFrom(current);
+                    merged.setAuthor(DedupUtility.mergeAuthor(merged.getAuthor(), current.getAuthor()));
+                    // collect the date only when the member carries one
+                    if (current.getDateofacceptance() != null) {
+                        acceptanceDates.add(current.getDateofacceptance().getValue());
+                    }
+                } catch (Exception exc) {
+                    throw new RuntimeException(exc);
+                }
+            }
+        }
+
+        merged.setDateofacceptance(DatePicker.pick(acceptanceDates));
+        if (merged.getDataInfo() == null) {
+            merged.setDataInfo(new DataInfo());
+        }
+        merged.getDataInfo().setTrust("0.9");
+        merged.setLastupdatetimestamp(ts);
+        return merged;
+    }
+
+    /**
+     * Merges a group of JSON-serialized datasets into a single record.
+     * <p>
+     * Every member is merged with {@code mergeFrom}, authors are combined through
+     * {@code DedupUtility.mergeAuthor} and the date of acceptance is chosen by {@link DatePicker}.
+     * The result gets trust {@code "0.9"} and {@code ts} as last update timestamp.
+     *
+     * @param e  pair of dedup id and JSON members of the group
+     * @param ts timestamp to set on the merged record
+     * @return the merged dataset
+     * @throws RuntimeException wrapping any failure while parsing or merging a member
+     */
+    private static Dataset datasetMerger(Tuple2<String, Iterable<String>> e, final long ts) {
+
+        final Dataset merged = new Dataset();
+        merged.setId(e._1());
+
+        final ObjectMapper jsonMapper = new ObjectMapper();
+        final Collection<String> acceptanceDates = Lists.newArrayList();
+
+        final Iterable<String> members = e._2();
+        if (members != null) {
+            for (final String json : members) {
+                try {
+                    final Dataset current = jsonMapper.readValue(json, Dataset.class);
+
+                    merged.mergeFrom(current);
+                    merged.setAuthor(DedupUtility.mergeAuthor(merged.getAuthor(), current.getAuthor()));
+                    // collect the date only when the member carries one
+                    if (current.getDateofacceptance() != null) {
+                        acceptanceDates.add(current.getDateofacceptance().getValue());
+                    }
+                } catch (Exception exc) {
+                    throw new RuntimeException(exc);
+                }
+            }
+        }
+
+        merged.setDateofacceptance(DatePicker.pick(acceptanceDates));
+        if (merged.getDataInfo() == null) {
+            merged.setDataInfo(new DataInfo());
+        }
+        merged.getDataInfo().setTrust("0.9");
+        merged.setLastupdatetimestamp(ts);
+        return merged;
+    }
+
+    /**
+     * Merges a group of JSON-serialized projects into a single record via {@code mergeFrom}.
+     * The result gets trust {@code "0.9"} and {@code ts} as last update timestamp.
+     *
+     * @param e  pair of dedup id and JSON members of the group
+     * @param ts timestamp to set on the merged record
+     * @return the merged project
+     * @throws RuntimeException wrapping any failure while parsing or merging a member
+     */
+    private static Project projectMerger(Tuple2<String, Iterable<String>> e, final long ts) {
+
+        final Project merged = new Project();
+        merged.setId(e._1());
+
+        final ObjectMapper jsonMapper = new ObjectMapper();
+
+        final Iterable<String> members = e._2();
+        if (members != null) {
+            for (final String json : members) {
+                try {
+                    final Project current = jsonMapper.readValue(json, Project.class);
+                    merged.mergeFrom(current);
+                } catch (Exception exc) {
+                    throw new RuntimeException(exc);
+                }
+            }
+        }
+
+        if (merged.getDataInfo() == null) {
+            merged.setDataInfo(new DataInfo());
+        }
+        merged.getDataInfo().setTrust("0.9");
+        merged.setLastupdatetimestamp(ts);
+        return merged;
+    }
+
+    /**
+     * Merges a group of JSON-serialized software records into a single record.
+     * <p>
+     * Every member is merged with {@code mergeFrom}, authors are combined through
+     * {@code DedupUtility.mergeAuthor} and the date of acceptance is chosen by {@link DatePicker}.
+     * The result gets trust {@code "0.9"} and {@code ts} as last update timestamp.
+     *
+     * @param e  pair of dedup id and JSON members of the group
+     * @param ts timestamp to set on the merged record
+     * @return the merged software record
+     * @throws RuntimeException wrapping any failure while parsing or merging a member
+     */
+    private static Software softwareMerger(Tuple2<String, Iterable<String>> e, final long ts) {
+
+        final Software merged = new Software();
+        merged.setId(e._1());
+
+        final ObjectMapper jsonMapper = new ObjectMapper();
+        final Collection<String> acceptanceDates = Lists.newArrayList();
+
+        final Iterable<String> members = e._2();
+        if (members != null) {
+            for (final String json : members) {
+                try {
+                    final Software current = jsonMapper.readValue(json, Software.class);
+
+                    merged.mergeFrom(current);
+                    merged.setAuthor(DedupUtility.mergeAuthor(merged.getAuthor(), current.getAuthor()));
+                    // collect the date only when the member carries one
+                    if (current.getDateofacceptance() != null) {
+                        acceptanceDates.add(current.getDateofacceptance().getValue());
+                    }
+                } catch (Exception exc) {
+                    throw new RuntimeException(exc);
+                }
+            }
+        }
+
+        merged.setDateofacceptance(DatePicker.pick(acceptanceDates));
+        if (merged.getDataInfo() == null) {
+            merged.setDataInfo(new DataInfo());
+        }
+        merged.getDataInfo().setTrust("0.9");
+        merged.setLastupdatetimestamp(ts);
+        return merged;
+    }
+
+    /**
+     * Merges a group of JSON-serialized datasources into a single record via {@code mergeFrom}.
+     * The result gets trust {@code "0.9"} and {@code ts} as last update timestamp.
+     *
+     * @param e  pair of dedup id and JSON members of the group
+     * @param ts timestamp to set on the merged record
+     * @return the merged datasource
+     * @throws RuntimeException wrapping any failure while parsing or merging a member
+     */
+    private static Datasource datasourceMerger(Tuple2<String, Iterable<String>> e, final long ts) {
+        final Datasource merged = new Datasource();
+        merged.setId(e._1());
+
+        final ObjectMapper jsonMapper = new ObjectMapper();
+
+        final Iterable<String> members = e._2();
+        if (members != null) {
+            for (final String json : members) {
+                try {
+                    final Datasource current = jsonMapper.readValue(json, Datasource.class);
+                    merged.mergeFrom(current);
+                } catch (Exception exc) {
+                    throw new RuntimeException(exc);
+                }
+            }
+        }
+
+        if (merged.getDataInfo() == null) {
+            merged.setDataInfo(new DataInfo());
+        }
+        merged.getDataInfo().setTrust("0.9");
+        merged.setLastupdatetimestamp(ts);
+        return merged;
+    }
+
+    /**
+     * Merges a group of JSON-serialized organizations into a single record via {@code mergeFrom}.
+     * The result gets trust {@code "0.9"} and {@code ts} as last update timestamp.
+     * <p>
+     * The trust of the individual members is not inspected: the merged record always gets the
+     * fixed trust value, so members without data info are merged like any other.
+     *
+     * @param e  pair of dedup id and JSON members of the group
+     * @param ts timestamp to set on the merged record
+     * @return the merged organization
+     * @throws RuntimeException wrapping any failure while parsing or merging a member
+     */
+    private static Organization organizationMerger(Tuple2<String, Iterable<String>> e, final long ts) {
+
+        Organization o = new Organization(); //the result of the merge, to be returned at the end
+
+        o.setId(e._1());
+
+        final ObjectMapper mapper = new ObjectMapper();
+
+        if (e._2() != null)
+            e._2().forEach(pub -> {
+                try {
+                    Organization organization = mapper.readValue(pub, Organization.class);
+
+                    o.mergeFrom(organization);
+
+                } catch (Exception exc) {
+                    throw new RuntimeException(exc);
+                }
+            });
+
+        if (o.getDataInfo() == null)
+            o.setDataInfo(new DataInfo());
+        o.getDataInfo().setTrust("0.9");
+        o.setLastupdatetimestamp(ts);
+
+        return o;
+    }
+
+    /**
+     * Merges a group of JSON-serialized "other research products" into a single record.
+     * <p>
+     * Every member is merged with {@code mergeFrom}, authors are combined through
+     * {@code DedupUtility.mergeAuthor} and the date of acceptance is chosen by {@link DatePicker}.
+     * The result gets trust {@code "0.9"} and {@code ts} as last update timestamp.
+     *
+     * @param e  pair of dedup id and JSON members of the group
+     * @param ts timestamp to set on the merged record
+     * @return the merged record
+     * @throws RuntimeException wrapping any failure while parsing or merging a member
+     */
+    private static OtherResearchProduct otherresearchproductMerger(Tuple2<String, Iterable<String>> e, final long ts) {
+
+        final OtherResearchProduct merged = new OtherResearchProduct();
+        merged.setId(e._1());
+
+        final ObjectMapper jsonMapper = new ObjectMapper();
+        final Collection<String> acceptanceDates = Lists.newArrayList();
+
+        final Iterable<String> members = e._2();
+        if (members != null) {
+            for (final String json : members) {
+                try {
+                    final OtherResearchProduct current = jsonMapper.readValue(json, OtherResearchProduct.class);
+
+                    merged.mergeFrom(current);
+                    merged.setAuthor(DedupUtility.mergeAuthor(merged.getAuthor(), current.getAuthor()));
+                    // collect the date only when the member carries one
+                    if (current.getDateofacceptance() != null) {
+                        acceptanceDates.add(current.getDateofacceptance().getValue());
+                    }
+                } catch (Exception exc) {
+                    throw new RuntimeException(exc);
+                }
+            }
+        }
+
+        if (merged.getDataInfo() == null) {
+            merged.setDataInfo(new DataInfo());
+        }
+        merged.setDateofacceptance(DatePicker.pick(acceptanceDates));
+        merged.getDataInfo().setTrust("0.9");
+        merged.setLastupdatetimestamp(ts);
+        return merged;
+    }
+
+}
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java
@ -0,0 +1,219 @@
+package eu.dnetlib.dedup;
+
+import com.google.common.collect.Sets;
+import com.wcohen.ss.JaroWinkler;
+import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
+import eu.dnetlib.pace.config.DedupConfig;
+
+import eu.dnetlib.pace.model.MapDocument;
+import eu.dnetlib.pace.model.Person;
+import org.apache.commons.codec.binary.Hex;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.spark.SparkContext;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.util.LongAccumulator;
+import scala.Tuple2;
+
+import java.io.IOException;
+import java.io.StringWriter;
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.text.Normalizer;
+import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+/**
+ * Static helpers shared by the dedup Spark jobs: accumulator setup, HDFS and classpath loading,
+ * output path naming, id hashing and author merging.
+ */
+public class DedupUtility {
+    // minimum name similarity required to move a pid from an author to another one
+    private static final Double THRESHOLD = 0.95;
+
+    /**
+     * Registers on the given context the long accumulators used as counters by the dedup process.
+     *
+     * @param dedupConf the dedup configuration, providing entity type, order field, group max size and threshold
+     * @param context   the spark context on which the accumulators are created
+     * @return the accumulators, indexed by their name ("entityType::counter name")
+     */
+    public static Map<String, LongAccumulator> constructAccumulator(final DedupConfig dedupConf, final SparkContext context) {
+
+        final Map<String, LongAccumulator> accumulators = new HashMap<>();
+
+        final List<String> counterNames = Arrays.asList(
+                "records per hash key = 1",
+                "missing " + dedupConf.getWf().getOrderField(),
+                String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize()),
+                "skip list",
+                "dedupSimilarity (x2)",
+                "d < " + dedupConf.getWf().getThreshold());
+
+        for (final String counterName : counterNames) {
+            final String accumulatorName = String.format("%s::%s", dedupConf.getWf().getEntityType(), counterName);
+            accumulators.put(accumulatorName, context.longAccumulator(accumulatorName));
+        }
+
+        return accumulators;
+    }
+
+    /**
+     * @param path    the path of the text file(s) to read
+     * @param context the spark context
+     * @return the RDD of the lines read from the given path
+     */
+    public static JavaRDD<String> loadDataFromHDFS(String path, JavaSparkContext context) {
+        return context.textFile(path);
+    }
+
+    /**
+     * Recursively deletes the given path from the default file system, when it exists.
+     *
+     * @param path the path to delete
+     * @throws IOException if the file system cannot be accessed
+     */
+    public static void deleteIfExists(String path) throws IOException {
+        Configuration conf = new Configuration();
+        FileSystem fileSystem = FileSystem.get(conf);
+        if (fileSystem.exists(new Path(path))) {
+            fileSystem.delete(new Path(path), true);
+        }
+    }
+
+    /**
+     * Loads a dedup configuration from a file stored on the default file system.
+     *
+     * @param path the path of the configuration file
+     * @return the parsed dedup configuration
+     * @throws IOException if the file cannot be read
+     */
+    public static DedupConfig loadConfigFromHDFS(String path) throws IOException {
+
+        Configuration conf = new Configuration();
+        FileSystem fileSystem = FileSystem.get(conf);
+
+        // the stream is closed as soon as the content has been read;
+        // the FileSystem object is intentionally left open
+        try (FSDataInputStream inputStream = fileSystem.open(new Path(path))) {
+            return DedupConfig.load(IOUtils.toString(inputStream, StandardCharsets.UTF_8.name()));
+        }
+    }
+
+    /**
+     * Reads a classpath resource as a UTF-8 string.
+     *
+     * @param filename the name of the resource, resolved against the given class
+     * @param clazz    the class used to locate the resource
+     * @return the content of the resource
+     * @throws RuntimeException if the resource does not exist or cannot be read
+     */
+    static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
+        try (java.io.InputStream in = clazz.getResourceAsStream(filename)) {
+            if (in == null) {
+                // getResourceAsStream returns null for a missing resource: fail with a meaningful message
+                throw new RuntimeException("cannot load resource from classpath: " + filename);
+            }
+            return IOUtils.toString(in, StandardCharsets.UTF_8.name());
+        } catch (final IOException e) {
+            throw new RuntimeException("cannot load resource from classpath: " + filename, e);
+        }
+    }
+
+    /**
+     * @param conf the dedup configuration
+     * @param doc  the document to cluster
+     * @return the set of grouping keys produced for the document by the clustering combiner
+     */
+    static Set<String> getGroupingKeys(DedupConfig conf, MapDocument doc) {
+        return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
+    }
+
+    /**
+     * @param s the string to hash
+     * @return the hex encoded MD5 digest of the UTF-8 bytes of the string, or null when the digest
+     * cannot be computed (e.g. null input)
+     */
+    public static String md5(final String s) {
+        try {
+            final MessageDigest md = MessageDigest.getInstance("MD5");
+            md.update(s.getBytes(StandardCharsets.UTF_8));
+            return new String(Hex.encodeHex(md.digest()));
+        } catch (final Exception e) {
+            System.err.println("Error creating id");
+            return null;
+        }
+    }
+
+
+    /**
+     * Merges two author lists: the list with more authors having a pid (or, on a tie, the longer one)
+     * is used as base and is enriched, in place, with the pids found only in the other list.
+     *
+     * @param a the first list of authors, may be null
+     * @param b the second list of authors, may be null
+     * @return the list chosen as base
+     */
+    public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
+        int pa = countAuthorsPids(a);
+        int pb = countAuthorsPids(b);
+        List<Author> base, enrich;
+        int sa = authorsSize(a);
+        int sb = authorsSize(b);
+
+        if (pa == pb) {
+            base = sa > sb ? a : b;
+            enrich = sa > sb ? b : a;
+        } else {
+            base = pa > pb ? a : b;
+            enrich = pa > pb ? b : a;
+        }
+        enrichPidFromList(base, enrich);
+        return base;
+    }
+
+    /**
+     * Adds to the authors of base the pids available only in enrich: each of those pids is assigned
+     * to the base author most similar to its owner, provided the similarity exceeds THRESHOLD.
+     */
+    private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
+        if (base == null || enrich == null)
+            return;
+
+        // pids already available in base, indexed by their comparable representation
+        final Map<String, Author> basePidAuthorMap = base.stream()
+                .filter(a -> a.getPid() != null && a.getPid().size() > 0)
+                .flatMap(a -> a.getPid()
+                        .stream()
+                        .map(p -> new Tuple2<>(p.toComparableString(), a))
+                ).collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
+
+        // pids (with their owner) that are in enrich but not in base
+        final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
+                .stream()
+                .filter(a -> a.getPid() != null && a.getPid().size() > 0)
+                .flatMap(a -> a.getPid().stream().filter(p -> !basePidAuthorMap.containsKey(p.toComparableString())).map(p -> new Tuple2<>(p, a)))
+                .collect(Collectors.toList());
+
+
+        pidToEnrich.forEach(candidate -> {
+            final Optional<Tuple2<Double, Author>> simAuthor = base.stream()
+                    .map(ba -> new Tuple2<>(sim(ba, candidate._2()), ba))
+                    .max(Comparator.comparing(Tuple2::_1));
+            if (simAuthor.isPresent() && simAuthor.get()._1() > THRESHOLD) {
+                final Author r = simAuthor.get()._2();
+                // the best match is searched among ALL the base authors, including those without pids
+                if (r.getPid() == null) {
+                    r.setPid(new ArrayList<>());
+                }
+                r.getPid().add(candidate._1());
+            }
+        });
+    }
+
+    /**
+     * @return the path of the entities of the given type: basePath/entityType
+     */
+    public static String createEntityPath(final String basePath, final String entityType) {
+        return String.format("%s/%s", basePath, entityType);
+    }
+
+    /**
+     * @return the path of the similarity relations of the given entity type: basePath/entityType_simRel
+     */
+    public static String createSimRelPath(final String basePath, final String entityType) {
+        return String.format("%s/%s_simRel", basePath, entityType);
+    }
+
+    /**
+     * @return the path of the merge relations of the given entity type: basePath/entityType_mergeRel
+     */
+    public static String createMergeRelPath(final String basePath, final String entityType) {
+        return String.format("%s/%s_mergeRel", basePath, entityType);
+    }
+
+    /**
+     * Jaro-Winkler similarity of two authors: computed on the surnames when both parsed names are
+     * accurate, on the normalised full names otherwise.
+     */
+    private static Double sim(Author a, Author b) {
+
+        final Person pa = parse(a);
+        final Person pb = parse(b);
+
+        if (pa.isAccurate() && pb.isAccurate()) {
+            return new JaroWinkler().score(
+                    normalize(pa.getSurnameString()),
+                    normalize(pb.getSurnameString()));
+        } else {
+            return new JaroWinkler().score(
+                    normalize(pa.getNormalisedFullname()),
+                    normalize(pb.getNormalisedFullname()));
+        }
+    }
+
+    /**
+     * Lower-cases the NFD form of the string and replaces non-word characters, diacritical marks,
+     * punctuation, digits and newlines with blanks.
+     */
+    private static String normalize(final String s) {
+        return nfd(s).toLowerCase()
+                // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
+                .replaceAll("(\\W)+", " ")
+                .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
+                .replaceAll("(\\p{Punct})+", " ")
+                .replaceAll("(\\d)+", " ")
+                .replaceAll("(\\n)+", " ")
+                .trim();
+    }
+
+    /**
+     * @return the NFD (canonical decomposition) form of the string
+     */
+    private static String nfd(final String s) {
+        return Normalizer.normalize(s, Normalizer.Form.NFD);
+    }
+
+    /**
+     * Builds a Person from "surname, name" when the surname is available, from the full name otherwise.
+     */
+    private static Person parse(Author author) {
+        if (StringUtils.isNotBlank(author.getSurname())) {
+            return new Person(author.getSurname() + ", " + author.getName(), false);
+        } else {
+            return new Person(author.getFullname(), false);
+        }
+    }
+
+
+    /**
+     * @return the number of authors having at least one pid with a non blank value, 0 for a null list
+     */
+    private static int countAuthorsPids(List<Author> authors) {
+        if (authors == null)
+            return 0;
+
+        return (int) authors.stream().filter(DedupUtility::hasPid).count();
+    }
+
+    /**
+     * @return the size of the list, 0 for a null list
+     */
+    private static int authorsSize(List<Author> authors) {
+        if (authors == null)
+            return 0;
+        return authors.size();
+    }
+
+    /**
+     * @return true when the author has at least one non null pid with a non blank value
+     */
+    private static boolean hasPid(Author a) {
+        if (a == null || a.getPid() == null || a.getPid().size() == 0)
+            return false;
+        return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
+    }
+}
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/Deduper.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/Deduper.java
@ -0,0 +1,162 @@
+package eu.dnetlib.dedup;
+
+import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.MapDocument;
+import eu.dnetlib.pace.util.BlockProcessor;
+import eu.dnetlib.pace.util.MapDocumentUtil;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.Function2;
+import org.apache.spark.api.java.function.PairFlatMapFunction;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.util.LongAccumulator;
+import scala.Serializable;
+import scala.Tuple2;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Spark building blocks of the deduplication: mapping of the JSON entities to documents,
+ * grouping of the documents in blocks and comparison of the documents of each block.
+ */
+public class Deduper implements Serializable {
+
+    private static final Log log = LogFactory.getLog(Deduper.class);
+
+    /**
+     * Runs the whole process: entities are mapped to documents, grouped in blocks and compared block by block.
+     *
+     * @param context  the spark context
+     * @param entities list of JSON entities to be deduped
+     * @param config   the dedup configuration
+     * @return the list of relations generated by the deduplication
+     */
+    public static JavaPairRDD<String, String> dedup(JavaSparkContext context, JavaRDD<String> entities, DedupConfig config) {
+
+        // NOTE(review): these accumulators are not passed to any of the calls below
+        // (computeRelations creates its own set); they look like a leftover of the commented-out code at the end of the method
+        Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());
+
+        //create vertexes of the graph: <ID, MapDocument>
+        JavaPairRDD<String, MapDocument> mapDocs = mapToVertexes(context, entities, config);
+
+
+        //create blocks for deduplication
+        JavaPairRDD<String, Iterable<MapDocument>> blocks = createBlocks(context, mapDocs, config);
+
+        //create relations by comparing only elements in the same group
+        return computeRelations(context, blocks, config);
+
+//        final RDD<Edge<String>> edgeRdd = relationRDD.map(it -> new Edge<>(it._1().hashCode(), it._2().hashCode(), "equalTo")).rdd();
+//
+//        RDD<Tuple2<Object, MapDocument>> vertexes = mapDocs.mapToPair((PairFunction<Tuple2<String, MapDocument>, Object, MapDocument>) t -> new Tuple2<Object, MapDocument>((long) t._1().hashCode(), t._2())).rdd();
+//        accumulators.forEach((name, acc) -> log.info(name + " -> " + acc.value()));
+//
+//        return GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD();
+    }
+
+    /**
+     * Compares the documents of each block and collects the relations emitted by the block processor.
+     *
+     * @param context the spark context
+     * @param blocks  list of blocks: <grouping key, documents>
+     * @param config  the dedup configuration
+     * @return the list of relations generated by the deduplication, without duplicated pairs
+     */
+    public static JavaPairRDD<String, String> computeRelations(JavaSparkContext context, JavaPairRDD<String, Iterable<MapDocument>> blocks, DedupConfig config) {
+
+        Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());
+
+        return blocks.flatMapToPair((PairFlatMapFunction<Tuple2<String, Iterable<MapDocument>>, String, String>) it -> {
+            // one reporter per block: it collects the relations emitted while processing the block
+            final SparkReporter reporter = new SparkReporter(accumulators);
+            new BlockProcessor(config).process(it._1(), it._2(), reporter);
+            return reporter.getRelations().iterator();
+
+        }).mapToPair(
+                // the same pair can be emitted by several blocks: key each relation by the concatenation
+                // of its two ids and keep a single relation per key
+                (PairFunction<Tuple2<String, String>, String, Tuple2<String, String>>) item ->
+                        new Tuple2<String, Tuple2<String, String>>(item._1() + item._2(), item))
+                .reduceByKey((a, b) -> a)
+                .mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
+    }
+
+
+    /**
+     * Groups the documents by the grouping keys computed by DedupUtility.getGroupingKeys.
+     *
+     * @param context the spark context
+     * @param mapDocs list of entities: <id, entity>
+     * @param config  the dedup configuration
+     * @return the list of blocks based on clustering of dedup configuration
+     */
+    public static JavaPairRDD<String, Iterable<MapDocument>> createBlocks(JavaSparkContext context, JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
+        return mapDocs
+                //the reduce is just to be sure that we haven't document with same id
+                .reduceByKey((a, b) -> a)
+                .map(Tuple2::_2)
+                //Clustering: from <id, doc> to List<groupkey,doc>
+                .flatMapToPair((PairFlatMapFunction<MapDocument, String, MapDocument>) a ->
+                        DedupUtility.getGroupingKeys(config, a)
+                                .stream()
+                                .map(it -> new Tuple2<>(it, a))
+                                .collect(Collectors.toList())
+                                .iterator())
+                .groupByKey();
+    }
+
+
+    /**
+     * Variant of createBlocks producing, for each grouping key, a list of documents: when two lists are merged
+     * the result is sorted by the string value of the order field and cut to the group max size.
+     *
+     * @param context the spark context
+     * @param mapDocs list of entities: <id, entity>
+     * @param config  the dedup configuration, providing the order field and the group max size
+     * @return the list of blocks: <grouping key, documents>
+     */
+    public static JavaPairRDD<String, List<MapDocument>> createsortedBlocks(JavaSparkContext context, JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
+        final String of = config.getWf().getOrderField();
+        final int maxQueueSize = config.getWf().getGroupMaxSize();
+        return mapDocs
+                //the reduce is just to be sure that we haven't document with same id
+                .reduceByKey((a, b) -> a)
+                .map(Tuple2::_2)
+                //Clustering: from <id, doc> to List<groupkey,doc>
+                .flatMapToPair((PairFlatMapFunction<MapDocument, String, List<MapDocument>>) a ->
+                        DedupUtility.getGroupingKeys(config, a)
+                                .stream()
+                                .map(it -> {
+                                            // each document starts as a singleton list, lists are merged by the reduce below
+                                            List<MapDocument> tmp = new ArrayList<>();
+                                            tmp.add(a);
+                                            return new Tuple2<>(it, tmp);
+                                        }
+                                )
+                                .collect(Collectors.toList())
+                                .iterator())
+                .reduceByKey((Function2<List<MapDocument>, List<MapDocument>, List<MapDocument>>) (v1, v2) -> {
+                    // merge into v1, sort by the order field and keep at most maxQueueSize documents
+                    v1.addAll(v2);
+                    v1.sort(Comparator.comparing(a -> a.getFieldMap().get(of).stringValue()));
+                    if (v1.size() > maxQueueSize)
+                        return new ArrayList<>(v1.subList(0, maxQueueSize));
+                    return v1;
+                });
+    }
+
+    /**
+     * Maps each JSON entity to a document, keyed by the document identifier.
+     *
+     * @param context  the spark context
+     * @param entities list of JSON entities
+     * @param config   the dedup configuration
+     * @return the list of vertexes: <id, mapDocument>
+     */
+    public static JavaPairRDD<String, MapDocument> mapToVertexes(JavaSparkContext context, JavaRDD<String> entities, DedupConfig config) {
+
+        return entities.mapToPair((PairFunction<String, String, MapDocument>) s -> {
+
+            MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(config, s);
+            return new Tuple2<String, MapDocument>(mapDocument.getIdentifier(), mapDocument);
+
+
+        });
+    }
+
+    /**
+     * Variant of computeRelations working on the blocks produced by createsortedBlocks.
+     *
+     * @param context the spark context
+     * @param blocks  list of blocks: <grouping key, list of documents>
+     * @param config  the dedup configuration
+     * @return the list of relations generated by the deduplication, without duplicated pairs
+     */
+    public static JavaPairRDD<String, String> computeRelations2(JavaSparkContext context, JavaPairRDD<String, List<MapDocument>> blocks, DedupConfig config) {
+        Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());
+
+        return blocks.flatMapToPair((PairFlatMapFunction<Tuple2<String, List<MapDocument>>, String, String>) it -> {
+            try {
+                final SparkReporter reporter = new SparkReporter(accumulators);
+                new BlockProcessor(config).processSortedBlock(it._1(), it._2(), reporter);
+                return reporter.getRelations().iterator();
+            } catch (Exception e) {
+                // the identifier of the first document of the block is used as message of the rethrown exception
+                throw new RuntimeException(it._2().get(0).getIdentifier(), e);
+            }
+        }).mapToPair(
+                // key each relation by the concatenation of its two ids and keep a single relation per key
+                (PairFunction<Tuple2<String, String>, String, Tuple2<String, String>>) item ->
+                        new Tuple2<String, Tuple2<String, String>>(item._1() + item._2(), item))
+                .reduceByKey((a, b) -> a)
+                .mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
+    }
+}
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java
@ -0,0 +1,15 @@
+package eu.dnetlib.dedup;
+
+/**
+ * The entity types known to the dedup module.
+ */
+public enum OafEntityType {
+    datasource, organization, project, dataset, otherresearchproduct, software, publication
+}
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java
@ -0,0 +1,79 @@
+package eu.dnetlib.dedup;
+
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+import eu.dnetlib.dedup.graph.ConnectedComponent;
+import eu.dnetlib.dedup.graph.GraphProcessor;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.util.MapDocumentUtil;
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.FlatMapFunction;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.graphx.Edge;
+import org.apache.spark.rdd.RDD;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SparkSession;
+import scala.Tuple2;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Spark job that reads the similarity relations of an entity type, computes the connected components
+ * of the resulting graph and saves the "merges" / "isMergedIn" relations of each component having
+ * more than one document.
+ */
+public class SparkCreateConnectedComponent {
+
+    public static void main(String[] args) throws Exception {
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+                IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json")));
+        parser.parseArgument(args);
+
+        final SparkSession spark = SparkSession
+                .builder()
+                .appName(SparkCreateConnectedComponent.class.getSimpleName())
+                .master(parser.get("master"))
+                .getOrCreate();
+        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+        final String inputPath = parser.get("sourcePath");
+        final String entity = parser.get("entity");
+        final String targetPath = parser.get("targetPath");
+        final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
+
+        // vertexes: <hash of the entity id, entity id>
+        final JavaPairRDD<Object, String> vertexes = sc.textFile(inputPath + "/" + entity)
+                .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
+                .mapToPair((PairFunction<String, Object, String>)
+                        s -> new Tuple2<Object, String>(getHashcode(s), s)
+                );
+
+        // edges: one for each similarity relation, between the hashes of its source and target
+        final Dataset<Relation> similarityRelations = spark.read()
+                .load(DedupUtility.createSimRelPath(targetPath, entity))
+                .as(Encoders.bean(Relation.class));
+        final RDD<Edge<String>> edgeRdd = similarityRelations.javaRDD()
+                .map(it -> new Edge<>(getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass()))
+                .rdd();
+
+        final JavaRDD<ConnectedComponent> cc = GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()).toJavaRDD();
+
+        // two relations for each document of a component: component -> document and document -> component
+        final JavaRDD<Relation> mergeRels = cc
+                .filter(k -> k.getDocIds().size() > 1)
+                .flatMap((FlatMapFunction<ConnectedComponent, Relation>) c -> {
+                    final List<Relation> rels = new ArrayList<>();
+                    for (final String id : c.getDocIds()) {
+                        rels.add(relation(c.getCcId(), id, "merges"));
+                        rels.add(relation(id, c.getCcId(), "isMergedIn"));
+                    }
+                    return rels.iterator();
+                });
+
+        final Dataset<Relation> mergeRelation = spark.createDataset(mergeRels.rdd(), Encoders.bean(Relation.class));
+        mergeRelation.write().mode("overwrite").save(DedupUtility.createMergeRelPath(targetPath, entity));
+    }
+
+    /**
+     * Creates a relation with the given source, target and relation class.
+     */
+    private static Relation relation(final String source, final String target, final String relClass) {
+        final Relation r = new Relation();
+        r.setSource(source);
+        r.setTarget(target);
+        r.setRelClass(relClass);
+        return r;
+    }
+
+    /**
+     * @param id the identifier to hash
+     * @return the murmur3 (128 bit) hash of the identifier, as a long
+     */
+    public  static long getHashcode(final String id) {
+        return Hashing.murmur3_128().hashUnencodedChars(id).asLong();
+    }
+}
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java
@ -0,0 +1,39 @@
+package eu.dnetlib.dedup;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
+import eu.dnetlib.pace.config.DedupConfig;
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SparkSession;
+
+/**
+ * Spark job that builds the dedup records of an entity type from its merge relations and saves them
+ * as JSON text under dedupPath/entity_dedup_record_json.
+ */
+public class SparkCreateDedupRecord {
+
+    // building an ObjectMapper is expensive and the instance is thread-safe once configured:
+    // a single instance per JVM is shared instead of creating a new one for every record
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+
+    /**
+     * @param args the command line arguments: master, sourcePath, entity, dedupPath and dedupConf are read
+     * @throws Exception if the arguments cannot be parsed or the job fails
+     */
+    public static void main(String[] args) throws Exception {
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json")));
+        parser.parseArgument(args);
+        final SparkSession spark = SparkSession
+                .builder()
+                .appName(SparkCreateDedupRecord.class.getSimpleName())
+                .master(parser.get("master"))
+                .getOrCreate();
+
+        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+        final String sourcePath = parser.get("sourcePath");
+        final String entity = parser.get("entity");
+        final String dedupPath = parser.get("dedupPath");
+        final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
+
+        final JavaRDD<OafEntity> dedupRecord = DedupRecordFactory.createDedupRecord(sc, spark, DedupUtility.createMergeRelPath(dedupPath,entity), DedupUtility.createEntityPath(sourcePath,entity), OafEntityType.valueOf(entity), dedupConf);
+
+        // the lambda refers to the static mapper, so nothing is captured and serialized with the closure
+        dedupRecord
+                .map(r -> MAPPER.writeValueAsString(r))
+                .saveAsTextFile(dedupPath+"/"+entity+"_dedup_record_json");
+    }
+
+}
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java
@ -0,0 +1,73 @@
+package eu.dnetlib.dedup;
+
+import com.google.common.hash.Hashing;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.model.MapDocument;
+import eu.dnetlib.pace.util.MapDocumentUtil;
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SparkSession;
+import scala.Tuple2;
+
+import java.util.List;
+
+
+/**
+ * Spark job that creates the similarity relations ("isSimilarTo") between the entities of a given type
+ * and saves them under the path returned by DedupUtility.createSimRelPath(targetPath, entity).
+ *
+ * Arguments read by the job:
+ *  master: the spark master
+ *  sourcePath: the base path of the input entities (the entities are read from sourcePath/entity)
+ *  entity: the type of entity to be deduped
+ *  targetPath: the base path of the output
+ *  dedupConf: the dedup configuration
+ */
+public class SparkCreateSimRels {
+
+    public static void main(String[] args) throws Exception {
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json")));
+        parser.parseArgument(args);
+        final SparkSession spark = SparkSession
+                .builder()
+                .appName(SparkCreateSimRels.class.getSimpleName())
+                .master(parser.get("master"))
+                .getOrCreate();
+
+        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+        final String inputPath = parser.get("sourcePath");
+        final String entity = parser.get("entity");
+        final String targetPath = parser.get("targetPath");
+        final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
+
+        // no count() of the input here: its result was never used and it forced a full extra scan of the data
+        JavaPairRDD<String, MapDocument> mapDocument = sc.textFile(DedupUtility.createEntityPath(inputPath, entity))
+                .mapToPair(s->{
+                    MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf,s);
+                    return new Tuple2<>(d.getIdentifier(), d);});
+
+        //create blocks for deduplication (sorted variant; Deduper.createBlocks is the unsorted alternative)
+        JavaPairRDD<String, List<MapDocument>> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf);
+
+        //create relations by comparing only elements in the same group
+        final JavaPairRDD<String,String> dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf);
+
+        final JavaRDD<Relation> isSimilarToRDD = dedupRels.map(simRel -> {
+            final Relation r = new Relation();
+            r.setSource(simRel._1());
+            r.setTarget(simRel._2());
+            r.setRelClass("isSimilarTo");
+            return r;
+        });
+
+        spark.createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(targetPath,entity));
+
+    }
+}
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkReporter.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkReporter.java
@ -0,0 +1,47 @@
+package eu.dnetlib.dedup;
+
+import eu.dnetlib.pace.util.Reporter;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.spark.util.LongAccumulator;
+import scala.Serializable;
+import scala.Tuple2;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Reporter backed by Spark accumulators: counters are forwarded to the accumulator having the
+ * matching name, emitted relations are collected in memory.
+ */
+public class SparkReporter implements Serializable, Reporter {
+
+    final List<Tuple2<String, String>> relations = new ArrayList<>();
+    private static final Log log = LogFactory.getLog(SparkReporter.class);
+    Map<String, LongAccumulator> accumulators;
+
+    /**
+     * @param accumulators the accumulators to update, indexed by "group::name"
+     */
+    public SparkReporter(Map<String, LongAccumulator> accumulators){
+        this.accumulators = accumulators;
+    }
+
+    /**
+     * Adds delta to the accumulator named "counterGroup::counterName" of the given map;
+     * nothing happens when the map has no such key.
+     */
+    public void incrementCounter(String counterGroup, String counterName, long delta, Map<String, LongAccumulator> accumulators) {
+        final String key = counterGroup + "::" + counterName;
+        if (!accumulators.containsKey(key)) {
+            return;
+        }
+        accumulators.get(key).add(delta);
+    }
+
+    /**
+     * Same as the four-argument variant, applied to the accumulators given to the constructor.
+     */
+    @Override
+    public void incrementCounter(String counterGroup, String counterName, long delta) {
+        incrementCounter(counterGroup, counterName, delta, this.accumulators);
+    }
+
+    /**
+     * Records the pair (from, to); the type is not stored.
+     */
+    @Override
+    public void emit(String type, String from, String to) {
+        final Tuple2<String, String> pair = new Tuple2<>(from, to);
+        relations.add(pair);
+    }
+
+    /**
+     * @return the pairs recorded so far (the internal list, not a copy)
+     */
+    public List<Tuple2<String, String>> getRelations() {
+        return relations;
+    }
+}
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java
@ -0,0 +1,80 @@
+package eu.dnetlib.dedup.graph;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dedup.DedupUtility;
+import eu.dnetlib.pace.util.PaceException;
+import org.apache.commons.lang.StringUtils;
+import org.codehaus.jackson.annotate.JsonIgnore;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Set;
+
+/**
+ * A group of document ids belonging to the same connected component, with the id assigned to the group.
+ */
+public class ConnectedComponent implements Serializable {
+
+    private Set<String> docIds;
+    private String ccId;
+
+
+    public ConnectedComponent() {
+    }
+
+    /**
+     * @param docIds the ids of the documents of the component; the component id is computed from them
+     */
+    public ConnectedComponent(Set<String> docIds) {
+        this.docIds = docIds;
+        createID();
+    }
+
+    /**
+     * Computes the id of the component. With more than one document the id is built as
+     * "prefix|dedup_______::md5", where prefix is the part of the minimum document id preceding the
+     * first '|' and md5 is the hash of the minimum document id, and it is stored as ccId.
+     * Otherwise the only document id is returned and ccId is left untouched.
+     *
+     * @return the computed id
+     */
+    public String createID() {
+        if (docIds.size() > 1) {
+            final String s = getMin();
+            String prefix = s.split("\\|")[0];
+            ccId =prefix + "|dedup_______::" + DedupUtility.md5(s);
+            return ccId;
+        } else {
+            return docIds.iterator().next();
+        }
+    }
+
+    /**
+     * @return the minimum document id, according to the natural ordering of strings
+     */
+    // toString() serializes with the com.fasterxml ObjectMapper, which ignores the org.codehaus annotation:
+    // the com.fasterxml one is needed to really keep this derived value out of the JSON
+    @JsonIgnore
+    @com.fasterxml.jackson.annotation.JsonIgnore
+    public String getMin(){
+
+        String min = "";
+        for (final String id : docIds) {
+            if (StringUtils.isBlank(min)) {
+                // nothing meaningful selected yet
+                min = min + id;
+            } else if (min.compareTo(id) > 0) {
+                min = id;
+            }
+        }
+        return min;
+    }
+
+    /**
+     * @return the JSON representation of the component
+     */
+    @Override
+    public String toString(){
+        ObjectMapper mapper = new ObjectMapper();
+        try {
+            return mapper.writeValueAsString(this);
+        } catch (IOException e) {
+            throw new PaceException("Failed to create Json: ", e);
+        }
+    }
+
+    public Set<String> getDocIds() {
+        return docIds;
+    }
+
+    public void setDocIds(Set<String> docIds) {
+        this.docIds = docIds;
+    }
+
+    public String getCcId() {
+        return ccId;
+    }
+
+    public void setCcId(String ccId) {
+        this.ccId = ccId;
+    }
+}
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala
@ -0,0 +1,37 @@
+package eu.dnetlib.dedup.graph
+
+import org.apache.spark.graphx._
+import org.apache.spark.rdd.RDD
+
+import scala.collection.JavaConversions;
+
+object GraphProcessor {
+
+  /**
+    * Computes the connected components of the graph built from the given vertexes
+    * and edges and groups the vertex payloads by component.
+    *
+    * A vertex for which no component is returned is grouped under its own vertex id.
+    *
+    * @param vertexes      pairs of (vertex id, payload string)
+    * @param edges         the edges linking the vertexes
+    * @param maxIterations upper bound passed to the connected components computation
+    * @return one ConnectedComponent per group of payloads
+    */
+  def findCCs(vertexes: RDD[(VertexId, String)], edges: RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = {
+    val graph: Graph[String, String] = Graph(vertexes, edges).partitionBy(PartitionStrategy.RandomVertexCut) //TODO remember to remove partitionby
+    val componentByVertex = graph.connectedComponents(maxIterations).vertices
+
+    vertexes
+      .leftOuterJoin(componentByVertex)
+      .map {
+        // fall back to the vertex's own id when the join found no component for it
+        case (vertexId, (openaireId, componentId)) => (componentId.getOrElse(vertexId), openaireId)
+      }
+      .groupByKey()
+      .map[ConnectedComponent](group => asConnectedComponent(group))
+  }
+
+
+
+  /**
+    * Wraps the payloads of one group into a ConnectedComponent; the group key is not used.
+    *
+    * @param group pair of (component id, payloads belonging to it)
+    * @return the ConnectedComponent built from the distinct payloads
+    */
+  def asConnectedComponent(group: (VertexId, Iterable[String])): ConnectedComponent = {
+    val (_, members) = group
+    new ConnectedComponent(JavaConversions.setAsJavaSet[String](members.toSet[String]))
+  }
+
+}
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json
@ -0,0 +1,33 @@
+[
+  {
+    "paramName": "mt",
+    "paramLongName": "master",
+    "paramDescription": "should be local or yarn",
+    "paramRequired": true
+  },
+  {
+    "paramName": "s",
+    "paramLongName": "sourcePath",
+    "paramDescription": "the path of the sequential file to read",
+    "paramRequired": true
+  },
+  {
+    "paramName": "e",
+    "paramLongName": "entity",
+    "paramDescription": "the type of entity to be deduped",
+    "paramRequired": true
+  },
+  {
+    "paramName": "c",
+    "paramLongName": "dedupConf",
+    "paramDescription": "dedup configuration to be used",
+    "compressed": true,
+    "paramRequired": true
+  },
+  {
+    "paramName": "d",
+    "paramLongName": "dedupPath",
+    "paramDescription": "dedup path to load mergeRelation",
+    "paramRequired": true
+  }
+]
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json
@ -0,0 +1,33 @@
+[
+  {
+    "paramName": "mt",
+    "paramLongName": "master",
+    "paramDescription": "should be local or yarn",
+    "paramRequired": true
+  },
+  {
+    "paramName": "s",
+    "paramLongName": "sourcePath",
+    "paramDescription": "the path of the sequential file to read",
+    "paramRequired": true
+  },
+  {
+    "paramName": "e",
+    "paramLongName": "entity",
+    "paramDescription": "the type of entity to be deduped",
+    "paramRequired": true
+  },
+  {
+    "paramName": "c",
+    "paramLongName": "dedupConf",
+    "paramDescription": "dedup configuration to be used",
+    "compressed": true,
+    "paramRequired": true
+  },
+  {
+    "paramName": "t",
+    "paramLongName": "targetPath",
+    "paramDescription": "target path to save dedup result",
+    "paramRequired": true
+  }
+]
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml
@ -0,0 +1,26 @@
+<configuration>
+    <!-- Default property values for the dedup Oozie application. -->
+
+    <!-- Cluster endpoints, referenced by the workflow actions as ${jobTracker} and ${nameNode}. -->
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <!-- Oozie share library settings for the spark actions. -->
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+    <!-- NOTE(review): the two hive properties below are not referenced by the accompanying
+         workflow.xml, and the metastore host looks environment specific; confirm they are needed. -->
+    <property>
+        <name>hive_metastore_uris</name>
+        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
+    </property>
+    <property>
+        <name>hive_db_name</name>
+        <value>openaire</value>
+    </property>
+</configuration>
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml
@ -0,0 +1,126 @@
+<workflow-app name="Dedup Entities" xmlns="uri:oozie:workflow:0.5">
+    <!-- Every property listed here has no default value, so it must be supplied when the job is submitted. -->
+    <parameters>
+        <property>
+            <name>sourcePath</name>
+            <description>the source path</description>
+        </property>
+        <property>
+            <name>entity</name>
+            <description>the entity that should be processed</description>
+        </property>
+        <property>
+            <name>dedupConf</name>
+            <description>the dedup Configuration</description>
+        </property>
+        <property>
+            <name>targetPath</name>
+            <description>the target path</description>
+        </property>
+        <!-- Declared because CreateDedupRecord uses ${dedupPath}; without the declaration a missing
+             value was only detected when the third action started.
+             NOTE(review): presumably the same location as targetPath; confirm. -->
+        <property>
+            <name>dedupPath</name>
+            <description>the dedup path from which the merge relations are loaded</description>
+        </property>
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCores</name>
+            <description>number of cores used by single executor</description>
+        </property>
+    </parameters>
+
+    <!-- Action chain: CreateSimRels, then CreateConnectedComponents, then CreateDedupRecord. -->
+    <start to="CreateSimRels"/>
+
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+<!--    <action name="DeleteTargetPath">-->
+<!--        <fs>-->
+<!--            <delete path='${targetPath}/${entity}_simrel'/>-->
+<!--            <delete path='${targetPath}/${entity}_mergeRels'/>-->
+<!--        </fs>-->
+<!--        <ok to="CreateSimRels"/>-->
+<!--        <error to="Kill"/>-->
+<!--    </action>-->
+
+    <!-- Step 1: runs eu.dnetlib.dedup.SparkCreateSimRels. -->
+    <action name="CreateSimRels">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Create Similarity Relations</name>
+            <class>eu.dnetlib.dedup.SparkCreateSimRels</class>
+            <jar>dhp-dedup-${projectVersion}.jar</jar>
+            <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory} --conf
+                spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
+                spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
+                spark.sql.warehouse.dir="/user/hive/warehouse"
+            </spark-opts>
+            <arg>-mt</arg><arg>yarn-cluster</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+            <arg>--targetPath</arg><arg>${targetPath}</arg>
+            <arg>--entity</arg><arg>${entity}</arg>
+            <arg>--dedupConf</arg><arg>${dedupConf}</arg>
+        </spark>
+        <ok to="CreateConnectedComponents"/>
+        <error to="Kill"/>
+    </action>
+
+
+    <!-- Step 2: runs eu.dnetlib.dedup.SparkCreateConnectedComponent with the same arguments as step 1. -->
+    <action name="CreateConnectedComponents">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Create Connected Components</name>
+            <class>eu.dnetlib.dedup.SparkCreateConnectedComponent</class>
+            <jar>dhp-dedup-${projectVersion}.jar</jar>
+            <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory} --conf
+                spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
+                spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
+                spark.sql.warehouse.dir="/user/hive/warehouse"
+            </spark-opts>
+            <arg>-mt</arg><arg>yarn-cluster</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+            <arg>--targetPath</arg><arg>${targetPath}</arg>
+            <arg>--entity</arg><arg>${entity}</arg>
+            <arg>--dedupConf</arg><arg>${dedupConf}</arg>
+        </spark>
+        <ok to="CreateDedupRecord"/>
+        <error to="Kill"/>
+    </action>
+
+    <!-- Step 3: runs eu.dnetlib.dedup.SparkCreateDedupRecord; it takes dedupPath instead of targetPath. -->
+    <action name="CreateDedupRecord">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Create Dedup Record</name>
+            <class>eu.dnetlib.dedup.SparkCreateDedupRecord</class>
+            <jar>dhp-dedup-${projectVersion}.jar</jar>
+            <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory} --conf
+                spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
+                spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
+                spark.sql.warehouse.dir="/user/hive/warehouse"
+            </spark-opts>
+            <arg>-mt</arg><arg>yarn-cluster</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+            <arg>--dedupPath</arg><arg>${dedupPath}</arg>
+            <arg>--entity</arg><arg>${entity}</arg>
+            <arg>--dedupConf</arg><arg>${dedupConf}</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>
--- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java
+++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java
@ -0,0 +1,61 @@
+package eu.dnetlib.dedup;
+
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import org.apache.commons.io.IOUtils;
+import org.codehaus.jackson.map.ObjectMapper;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * Merges the publications listed in the {@code authors_merge.json} fixture into a single
+ * record and prints the result. The test has no assertions: it only checks that the merge
+ * runs without throwing.
+ */
+public class MergeAuthorTest {
+
+    List<Publication> publicationsToMerge;
+    final ObjectMapper mapper = new ObjectMapper();
+
+    /**
+     * Loads the fixture, which holds one JSON serialised {@link Publication} per line.
+     *
+     * @throws Exception if the fixture cannot be read or parsed
+     */
+    @Before
+    public void setUp() throws Exception {
+        final String json;
+        // close the resource stream and decode it explicitly as UTF-8 rather than with the platform charset
+        try (java.io.InputStream in = getClass().getResourceAsStream("/eu/dnetlib/dedup/json/authors_merge.json")) {
+            json = IOUtils.toString(in, "UTF-8");
+        }
+
+        publicationsToMerge = Arrays.stream(json.split("\n"))
+                .filter(line -> !line.trim().isEmpty()) // tolerate blank lines in the fixture
+                .map(line -> {
+                    try {
+                        return mapper.readValue(line, Publication.class);
+                    } catch (IOException e) {
+                        throw new RuntimeException(e);
+                    }
+                })
+                .collect(Collectors.toList());
+    }
+
+
+    /**
+     * Folds every loaded publication into an empty one, merging the author lists through
+     * {@code DedupUtility.mergeAuthor}, and prints the merged record as JSON.
+     */
+    @Test
+    public void test() throws Exception {
+        Publication dedup = new Publication();
+
+        publicationsToMerge.forEach(p -> {
+            dedup.mergeFrom(p);
+            dedup.setAuthor(DedupUtility.mergeAuthor(dedup.getAuthor(), p.getAuthor()));
+        });
+
+        System.out.println(mapper.writeValueAsString(dedup));
+    }
+
+}
--- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java
+++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java
@ -0,0 +1,86 @@
+package eu.dnetlib.dedup;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * Manual drivers for the three dedup Spark jobs, plus two small utility tests.
+ *
+ * <p>The job drivers are marked {@code @Ignore} and read from a developer specific
+ * local path, so they only run when enabled by hand.
+ */
+public class SparkCreateDedupTest {
+
+    String configuration;
+    String entity = "organization";
+
+    /**
+     * Loads the dedup configuration used by all the tests.
+     *
+     * @throws IOException if the configuration resource cannot be read
+     */
+    @Before
+    public void setUp() throws IOException {
+        // close the resource stream and decode it explicitly as UTF-8 rather than with the platform charset
+        try (java.io.InputStream in = getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json")) {
+            configuration = IOUtils.toString(in, "UTF-8");
+        }
+    }
+
+    /** Runs {@code SparkCreateSimRels} locally. */
+    @Test
+    @Ignore
+    public void createSimRelsTest() throws Exception {
+        SparkCreateSimRels.main(new String[] {
+                "-mt", "local[*]",
+                "-s", "/Users/miconis/dumps",
+                "-e", entity,
+                "-c", ArgumentApplicationParser.compressArgument(configuration),
+                "-t", "/tmp/dedup",
+        });
+    }
+
+    /** Runs {@code SparkCreateConnectedComponent} locally with the same arguments as the sim rels job. */
+    @Test
+    @Ignore
+    public void createCCTest() throws Exception {
+
+        SparkCreateConnectedComponent.main(new String[] {
+                "-mt", "local[*]",
+                "-s", "/Users/miconis/dumps",
+                "-e", entity,
+                "-c", ArgumentApplicationParser.compressArgument(configuration),
+                "-t", "/tmp/dedup",
+        });
+    }
+
+    /** Runs {@code SparkCreateDedupRecord} locally; it takes {@code -d} where the other jobs take {@code -t}. */
+    @Test
+    @Ignore
+    public void dedupRecordTest() throws Exception {
+        SparkCreateDedupRecord.main(new String[] {
+                "-mt", "local[*]",
+                "-s", "/Users/miconis/dumps",
+                "-e", entity,
+                "-c", ArgumentApplicationParser.compressArgument(configuration),
+                "-d", "/tmp/dedup",
+        });
+    }
+
+    /** Prints the configuration in the compressed form passed to the jobs through {@code -c}. */
+    @Test
+    public void printConfiguration() throws Exception {
+        System.out.println(ArgumentApplicationParser.compressArgument(configuration));
+    }
+
+    /** Prints {@code String.hashCode} and the murmur3_128 hash (as a long) of two sample identifiers. */
+    @Test
+    public void testHashCode() {
+        final String s1 = "20|grid________::6031f94bef015a37783268ec1e75f17f";
+        final String s2 = "20|nsf_________::b12be9edf414df8ee66b4c52a2d8da46";
+
+        final HashFunction hashFunction = Hashing.murmur3_128();
+
+        System.out.println(s1.hashCode());
+        System.out.println(hashFunction.hashUnencodedChars(s1).asLong());
+        System.out.println(s2.hashCode());
+        System.out.println(hashFunction.hashUnencodedChars(s2).asLong());
+
+    }
+
+
+}
--- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/jpath/JsonPathTest.java
+++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/jpath/JsonPathTest.java
@ -0,0 +1,31 @@
+package eu.dnetlib.dedup.jpath;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.jayway.jsonpath.JsonPath;
+import org.apache.commons.io.IOUtils;
+import org.junit.Test;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Checks that the {@code $.pid[*]} JSON path can be evaluated against the sample record
+ * and that every extracted element can be serialised back to JSON.
+ */
+public class JsonPathTest {
+
+    /**
+     * Reads the sample record, extracts its {@code pid} elements and prints each one as JSON.
+     *
+     * @throws Exception if the sample cannot be read, or if an element cannot be serialised;
+     *                   the failure now fails the test instead of only being printed
+     */
+    @Test
+    public void testJPath() throws Exception {
+        final String json;
+        // close the resource stream and decode it explicitly as UTF-8 rather than with the platform charset
+        try (java.io.InputStream in = getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/sample.json")) {
+            json = IOUtils.toString(in, "UTF-8");
+        }
+        final List<Map<String, Object>> pid = JsonPath.read(json, "$.pid[*]");
+
+        // one mapper for the whole loop; a plain loop lets the checked exception propagate
+        final ObjectMapper mapper = new ObjectMapper();
+        for (final Map<String, Object> it : pid) {
+            System.out.println(mapper.writeValueAsString(it));
+        }
+    }
+}
--- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json
+++ b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json
@ -0,0 +1,266 @@
+{
+  "wf" : {
+    "threshold" : "0.99",
+    "dedupRun" : "001",
+    "entityType" : "organization",
+    "orderField" : "legalname",
+    "queueMaxSize" : "2000",
+    "groupMaxSize" : "50",
+    "slidingWindowSize" : "200",
+    "idPath":"$.id",
+    "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
+    "includeChildren" : "true",
+    "maxIterations": "20"
+  },
+  "pace" : {
+    "clustering" : [
+      { "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
+      { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
+      { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
+      { "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
+    ],
+    "decisionTree" : {
+      "start": {
+        "fields": [
+          {
+            "field": "gridid",
+            "comparator": "exactMatch",
+            "weight": 1,
+            "countIfUndefined": "false",
+            "params": {}
+          }
+        ],
+        "threshold": 1,
+        "aggregation": "AVG",
+        "positive": "MATCH",
+        "negative": "NO_MATCH",
+        "undefined": "layer2",
+        "ignoreUndefined": "false"
+      },
+      "layer2": {
+        "fields": [
+          {
+            "field": "websiteurl",
+            "comparator": "domainExactMatch",
+            "weight": 1,
+            "countIfUndefined": "false",
+            "params": {}
+          },
+          {
+            "field": "country",
+            "comparator": "exactMatch",
+            "weight": 1,
+            "countIfUndefined": "true",
+            "params": {}
+          },
+          {
+            "field": "legalname",
+            "comparator": "numbersMatch",
+            "weight": 1,
+            "countIfUndefined": "true",
+            "params": {}
+          },
+          {
+            "field": "legalname",
+            "comparator": "romansMatch",
+            "weight": 1,
+            "countIfUndefined": "true",
+            "params": {}
+          }
+        ],
+        "threshold": 1,
+        "aggregation": "AND",
+        "positive": "layer3",
+        "negative": "NO_MATCH",
+        "undefined": "layer3",
+        "ignoreUndefined": "true"
+      },
+      "layer3": {
+        "fields": [
+          {
+            "field": "legalname",
+            "comparator": "cityMatch",
+            "weight": 1.0,
+            "countIfUndefined": "true",
+            "params": {
+              "windowSize": "4"
+            }
+          }
+        ],
+        "threshold": 0.7,
+        "aggregation": "W_MEAN",
+        "positive": "layer4",
+        "negative": "NO_MATCH",
+        "undefined": "NO_MATCH",
+        "ignoreUndefined": "true"
+      },
+      "layer4": {
+        "fields": [
+          {
+            "field": "legalname",
+            "comparator": "keywordMatch",
+            "weight": 1.0,
+            "countIfUndefined": "true",
+            "params": {
+              "windowSize": "4"
+            }
+          }
+        ],
+        "threshold": 0.9,
+        "aggregation": "AVG",
+        "positive": "layer5",
+        "negative": "NO_MATCH",
+        "undefined": "layer5",
+        "ignoreUndefined": "true"
+      },
+      "layer5": {
+        "fields": [
+          {
+            "field": "legalname",
+            "comparator": "jaroWinklerNormalizedName",
+            "weight": 0.9,
+            "countIfUndefined": "true",
+            "params": {
+              "windowSize": "4"
+            }
+          },
+          {
+            "field": "legalshortname",
+            "comparator": "jaroWinklerNormalizedName",
+            "weight": 0.1,
+            "countIfUndefined": "false",
+            "params": {}
+          }
+        ],
+        "threshold": 0.9,
+        "aggregation": "W_MEAN",
+        "positive": "MATCH",
+        "negative": "NO_MATCH",
+        "undefined": "NO_MATCH",
+        "ignoreUndefined": "true"
+      }
+    },
+    "model" : [
+      { "name" : "country", "type" : "String", "path" : "$.country.classid"},
+      { "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"},
+      { "name" : "legalname", "type" : "String", "path" : "$.legalname.value" },
+      { "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" },
+      { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid.ac')].value"},
+      { "name" : "originalId", "type" : "String", "path" : "$.id" }
+    ],
+    "blacklists" : {
+      "legalname" : []
+    },
+    "synonyms": {
+      "key::1": ["university","università","università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
+      "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
+      "key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
+      "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
+      "key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"],
+      "key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"],
+      "key::7": ["college","collegio","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","κολλέγιο"],
+      "key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
+      "key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
+      "key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],
+      "key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],
+      "key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],
+      "key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],
+      "key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"],
+      "key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"],
+      "key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"],
+      "key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"],
+      "key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"],
+      "key::19": ["museum","museo","musée","mueso","museu","museum","muzeum","музей","museum","μουσείο"],
+      "key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"],
+      "key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"],
+      "key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"],
+      "key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"],
+      "key::24": ["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"],
+      "key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"],
+      "key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","цетральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"],
+      "key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"],
+      "key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"],
+      "key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"],
+      "key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"],
+      "key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"],
+      "key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"],
+      "key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"],
+      "key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"],
+      "key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"],
+      "key::36": ["authority","autorità","autorité","авторитет","autoriteit"],
+      "key::37": ["federation","federazione","fédération","федерация","federatie","ομοσπονδία"],
+      "key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"],
+      "key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"],
+      "key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"],
+      "key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"],
+      "key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"],
+      "key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"],
+      "key::44": ["academic","accademico","académique","universitaire","академический","academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"],
+      "key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"],
+      "key::46": ["division","divisione","division","отделение","divisie","τμήμα"],
+      "key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"],
+      "key::48": ["promotion","promozione","продвижение","proothisis","forderung"],
+      "key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"],
+      "key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"],
+      "key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik",""],
+      "key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri",""],
+      "key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus",""],
+      "key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia",""],
+      "key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik",""],
+      "key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych",""],
+      "key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne",""],
+      "key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna",""],
+      "key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri",""],
+      "key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline",""],
+      "key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu",""],
+      "key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu",""],
+      "key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid",""],
+      "key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus",""],
+      "key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi",""],
+      "key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia",""],
+      "key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus",""],
+      "key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik",""],
+      "key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline",""],
+      "key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärztlich","veterinair","veeartsenijkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria",""],
+      "key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia",""],
+      "key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek",""],
+      "key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia",""],
+      "key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa",""],
+      "key::75": ["theological","teologia","teologico","teológico","teológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","teológiai","teološki","teoloogia","usuteadus","teoloogiline",""],
+      "key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika",""],
+      "key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus",""],
+      "key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus",""],
+      "key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi",""],
+      "key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia",""],
+      "key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline",""],
+      "key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti",""],
+      "key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline",""],
+      "key::84": ["communication","comunicazione","comunicación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon",""],
+      "key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus",""],
+      "key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos",""],
+      "key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δερματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia",""],
+      "key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur",""],
+      "key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika",""],
+      "key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel",""],
+      "key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused",""],
+      "key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""],
+      "key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""],
+      "key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""],
+      "key::95": ["mechanics", "mechanical", "meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],
+      "key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszichiátria","psihiatrija","psühhaatria",""],
+      "key::97": ["psychology","psicologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszichológia","psihologija","psühholoogia",""],
+      "key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""],
+      "key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia",""],
+      "key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia",""],
+      "key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia",""],
+      "key::102": ["informatics","informatica","informática","informática","informatica",""],
+      "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
+      "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"],
+      "key::105" : ["state", "stato", "etade", "estado", "statale", "etat", "zustand", "estado"],
+      "key::106" : ["seminary", "seminario", "seminaire", "seminar"],
+      "key::107" : ["agricultural forestry", "af", "a f"],
+      "key::108" : ["agricultural mechanical", "am", "a m"],
+      "key::109" : ["catholic", "catholique", "katholische", "catolica", "cattolica", "catolico"]
+    }
+  }
+}
--- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json
+++ b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json
@ -0,0 +1,280 @@
+{
+  "wf" : {
+    "threshold" : "0.99",
+    "dedupRun" : "001",
+    "entityType" : "result",
+    "subEntityType" : "resulttype",
+    "subEntityValue" : "publication",
+    "orderField" : "title",
+    "queueMaxSize" : "2000",
+    "groupMaxSize" : "100",
+    "maxChildren" : "100",
+    "idPath": "$.id",
+    "slidingWindowSize" : "200",
+    "rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_isAffiliatedWith", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
+    "includeChildren" : "true"
+  },
+  "pace" : {
+    "clustering" : [
+      { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
+      { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
+      { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
+    ],
+    "strictConditions" : [
+      { "name" : "pidMatch", "fields" : [ "pid" ] }
+    ],
+    "conditions" : [
+      { "name" : "titleVersionMatch", "fields" : [ "title" ] },
+      { "name" : "sizeMatch", "fields" : [ "authors" ] }
+    ],
+    "model" : [
+      { "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.pid[?(@.qualifier.classid ==\"doi\")].value" },
+      { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.pid", "overrideMatch" : "true" },
+      { "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "$.title[?(@.qualifier.classid ==\"main title\")].value", "length" : 250, "size" : 5 },
+      { "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.author[*].fullname", "size" : 200 },
+      { "name" : "resulttype", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "false", "path" : "$.resulttype.classid" }
+    ],
+    "synonyms": {},
+    "blacklists" : {
+      "title" : [
+        "^Inside Front Cover$",
+        "(?i)^Poster presentations$",
+        "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
+        "^Problems with perinatal pathology\\.?$",
+        "(?i)^Cases? of Puerperal Convulsions$",
+        "(?i)^Operative Gyna?ecology$",
+        "(?i)^Mind the gap\\!?\\:?$",
+        "^Chronic fatigue syndrome\\.?$",
+        "^Cartas? ao editor Letters? to the Editor$",
+        "^Note from the Editor$",
+        "^Anesthesia Abstract$",
+
+        "^Annual report$",
+        "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
+        "(?i)^Graph and Table of Infectious Diseases?$",
+        "^Presentation$",
+        "(?i)^Reviews and Information on Publications$",
+        "(?i)^PUBLIC HEALTH SERVICES?$",
+        "(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
+        "(?i)^Adrese autora$",
+        "(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
+        "(?i)^Acknowledgement to Referees$",
+        "(?i)^Behçet's disease\\.?$",
+        "(?i)^Isolation and identification of restriction endonuclease.*$",
+        "(?i)^CEREBROVASCULAR DISEASES?\\.?$",
+        "(?i)^Screening for abdominal aortic aneurysms?\\.?$",
+        "^Event management$",
+        "(?i)^Breakfast and Crohn's disease.*\\.?$",
+        "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
+        "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
+        "^Gushi hakubutsugaku$",
+
+        "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
+        "^Intestinal spirocha?etosis$",
+        "^Treatment of Rodent Ulcer$",
+        "(?i)^\\W*Cloud Computing\\W*$",
+        "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
+        "^Free Communications, Poster Presentations: Session [A-F]$",
+
+        "^“The Historical Aspects? of Quackery\\.?”$",
+        "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
+        "^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
+        "(?i)^Case Report$",
+        "^Boletín Informativo$",
+        "(?i)^Glioblastoma Multiforme$",
+        "(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
+        "^Zaměstnanecké výhody$",
+        "(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
+        "(?i)^Carotid body tumours?\\.?$",
+        "(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
+        "^Avant-propos$",
+        "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
+        "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
+        "(?i)^PUBLIC HEALTH VERSUS THE STATE$",
+        "^Viñetas de Cortázar$",
+        "(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
+        "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
+        "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
+        "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
+
+        "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
+        "^Aus der AGMB$",
+
+        "^Znanstveno-stručni prilozi$",
+        "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
+        "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
+        "(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
+        "^Finanční analýza podniku$",
+        "^Financial analysis( of business)?$",
+        "(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
+        "^Jikken nihon shūshinsho$",
+        "(?i)^CORONER('|s)(s|') INQUESTS$",
+        "(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
+        "(?i)^Consultants' contract(s)?$",
+        "(?i)^Upute autorima$",
+        "(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
+        "^Joshi shin kokubun$",
+        "^Kōtō shōgaku dokuhon nōson'yō$",
+        "^Jinjō shōgaku shōka$",
+        "^Shōgaku shūjichō$",
+        "^Nihon joshi dokuhon$",
+        "^Joshi shin dokuhon$",
+        "^Chūtō kanbun dokuhon$",
+        "^Wabun dokuhon$",
+        "(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
+        "(?i)^cardiac rehabilitation$",
+        "(?i)^Analytical summary$",
+        "^Thesaurus resolutionum Sacrae Congregationis Concilii$",
+        "(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
+        "^Prikazi i osvrti$",
+        "^Rodinný dům s provozovnou$",
+        "^Family house with an establishment$",
+        "^Shinsei chūtō shin kokugun$",
+        "^Pulmonary alveolar proteinosis(\\.?)$",
+        "^Shinshū kanbun$",
+        "^Viñeta(s?) de Rodríguez$",
+        "(?i)^RUBRIKA UREDNIKA$",
+        "^A Matching Model of the Academic Publication Market$",
+        "^Yōgaku kōyō$",
+
+        "^Internetový marketing$",
+        "^Internet marketing$",
+        "^Chūtō kokugo dokuhon$",
+        "^Kokugo dokuhon$",
+        "^Antibiotic Cover for Dental Extraction(s?)$",
+        "^Strategie podniku$",
+        "^Strategy of an Enterprise$",
+        "(?i)^respiratory disease(s?)(\\.?)$",
+        "^Award(s?) for Gallantry in Civil Defence$",
+        "^Podniková kultura$",
+        "^Corporate Culture$",
+        "^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
+        "^Pracovní motivace$",
+        "^Work Motivation$",
+        "^Kaitei kōtō jogaku dokuhon$",
+        "^Konsolidovaná účetní závěrka$",
+        "^Consolidated Financial Statements$",
+        "(?i)^intracranial tumour(s?)$",
+        "^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
+        "^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
+        "^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
+        "^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
+        "^Úroveň motivačního procesu jako způsobu vedení lidí$",
+        "^The level of motivation process as a leadership$",
+        "^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
+        "(?i)^news and events$",
+        "(?i)^NOVOSTI I DOGAĐAJI$",
+        "^Sansū no gakushū$",
+        "^Posouzení informačního systému firmy a návrh změn$",
+        "^Information System Assessment and Proposal for ICT Modification$",
+        "^Stresové zatížení pracovníků ve vybrané profesi$",
+        "^Stress load in a specific job$",
+
+        "^Sunday: Poster Sessions, Pt.*$",
+        "^Monday: Poster Sessions, Pt.*$",
+        "^Wednesday: Poster Sessions, Pt.*$",
+        "^Tuesday: Poster Sessions, Pt.*$",
+
+        "^Analýza reklamy$",
+        "^Analysis of advertising$",
+
+        "^Shōgaku shūshinsho$",
+        "^Shōgaku sansū$",
+        "^Shintei joshi kokubun$",
+        "^Taishō joshi kokubun dokuhon$",
+        "^Joshi kokubun$",
+
+        "^Účetní uzávěrka a účetní závěrka v ČR$",
+        "(?i)^The \"?Causes\"? of Cancer$",
+        "^Normas para la publicación de artículos$",
+        "^Editor('|s)(s|') [Rr]eply$",
+        "^Editor(’|s)(s|’) letter$",
+        "^Redaktoriaus žodis$",
+        "^DISCUSSION ON THE PRECEDING PAPER$",
+        "^Kōtō shōgaku shūshinsho jidōyō$",
+        "^Shōgaku nihon rekishi$",
+        "^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
+        "^Préface$",
+        "^Occupational [Hh]ealth [Ss]ervices.$",
+        "^In Memoriam Professor Toshiyuki TAKESHIMA$",
+        "^Účetní závěrka ve vybraném podniku.*$",
+        "^Financial statements in selected company$",
+        "^Abdominal [Aa]ortic [Aa]neurysms.*$",
+        "^Pseudomyxoma peritonei$",
+        "^Kazalo autora$",
+
+        "(?i)^uvodna riječ$",
+        "^Motivace jako způsob vedení lidí$",
+        "^Motivation as a leadership$",
+        "^Polyfunkční dům$",
+        "^Multi\\-funkcional building$",
+        "^Podnikatelský plán$",
+        "(?i)^Podnikatelský záměr$",
+        "(?i)^Business Plan$",
+        "^Oceňování nemovitostí$",
+        "^Marketingová komunikace$",
+        "^Marketing communication$",
+        "^Sumario Analítico$",
+        "^Riječ uredništva$",
+        "^Savjetovanja i priredbe$",
+        "^Índice$",
+        "^(Starobosanski nadpisi).*$",
+        "^Vzdělávání pracovníků v organizaci$",
+        "^Staff training in organization$",
+        "^(Life Histories of North American Geometridae).*$",
+        "^Strategická analýza podniku$",
+        "^Strategic Analysis of an Enterprise$",
+        "^Sadržaj$",
+        "^Upute suradnicima$",
+        "^Rodinný dům$",
+        "(?i)^Fami(l)?ly house$",
+        "^Upute autorima$",
+        "^Strategic Analysis$",
+        "^Finanční analýza vybraného podniku$",
+        "^Finanční analýza$",
+        "^Riječ urednika$",
+        "(?i)^Content(s?)$",
+        "(?i)^Inhalt$",
+        "^Jinjō shōgaku shūshinsho jidōyō$",
+        "(?i)^Index$",
+        "^Chūgaku kokubun kyōkasho$",
+        "^Retrato de una mujer$",
+        "^Retrato de un hombre$",
+        "^Kōtō shōgaku dokuhon$",
+        "^Shotōka kokugo$",
+        "^Shōgaku dokuhon$",
+        "^Jinjō shōgaku kokugo dokuhon$",
+        "^Shinsei kokugo dokuhon$",
+        "^Teikoku dokuhon$",
+        "^Instructions to Authors$",
+        "^KİTAP TAHLİLİ$",
+        "^PRZEGLĄD PIŚMIENNICTWA$",
+        "(?i)^Presentación$",
+        "^İçindekiler$",
+        "(?i)^Tabl?e of contents$",
+        "^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
+        "^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
+        "^Editorial( Board)?$",
+        "(?i)^Editorial \\(English\\)$",
+        "^Editörden$",
+        "^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
+        "^(Kiri Karl Morgensternile).*$",
+        "^(\\[Eksliibris Aleksandr).*\\]$",
+        "^(\\[Eksliibris Aleksandr).*$",
+        "^(Eksliibris Aleksandr).*$",
+        "^(Kiri A\\. de Vignolles).*$",
+        "^(2 kirja Karl Morgensternile).*$",
+        "^(Pirita kloostri idaosa arheoloogilised).*$",
+        "^(Kiri tundmatule).*$",
+        "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
+        "^(Eksliibris Nikolai Birukovile).*$",
+        "^(Eksliibris Nikolai Issakovile).*$",
+        "^(WHP Cruise Summary Information of section).*$",
+        "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
+        "^(Measurement of the spin\\-dependent structure function).*",
+        "(?i)^.*authors['’′]? reply\\.?$",
+        "(?i)^.*authors['’′]? response\\.?$"
+      ]
+    }
+  }
+}
--- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json
+++ b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json
@ -0,0 +1,386 @@
+{
+  "wf": {
+    "threshold": "0.99",
+    "dedupRun": "001",
+    "entityType": "result",
+    "subEntityType": "resulttype",
+    "subEntityValue": "publication",
+    "orderField": "title",
+    "queueMaxSize": "2000",
+    "groupMaxSize": "100",
+    "maxChildren": "100",
+    "slidingWindowSize": "200",
+    "rootBuilder": [
+      "result",
+      "resultProject_outcome_isProducedBy",
+      "resultResult_publicationDataset_isRelatedTo",
+      "resultResult_similarity_isAmongTopNSimilarDocuments",
+      "resultResult_similarity_hasAmongTopNSimilarDocuments",
+      "resultOrganization_affiliation_isAffiliatedWith",
+      "resultResult_part_hasPart",
+      "resultResult_part_isPartOf",
+      "resultResult_supplement_isSupplementTo",
+      "resultResult_supplement_isSupplementedBy",
+      "resultResult_version_isVersionOf"
+    ],
+    "includeChildren": "true",
+    "maxIterations": 20,
+    "idPath": "$.id"
+  },
+  "pace": {
+    "clustering": [
+      {
+        "name": "ngrampairs",
+        "fields": [
+          "title"
+        ],
+        "params": {
+          "max": "1",
+          "ngramLen": "3"
+        }
+      },
+      {
+        "name": "suffixprefix",
+        "fields": [
+          "title"
+        ],
+        "params": {
+          "max": "1",
+          "len": "3"
+        }
+      },
+      {
+        "name": "lowercase",
+        "fields": [
+          "doi"
+        ],
+        "params": {}
+      }
+    ],
+    "decisionTree": {
+      "start": {
+        "fields": [
+          {
+            "field": "pid",
+            "comparator": "jsonListMatch",
+            "weight": 1.0,
+            "countIfUndefined": "false",
+            "params": {
+              "jpath_value": "$.value",
+              "jpath_classid": "$.qualifier.classid"
+            }
+          }
+        ],
+        "threshold": 0.5,
+        "aggregation": "AVG",
+        "positive": "MATCH",
+        "negative": "layer2",
+        "undefined": "layer2",
+        "ignoreUndefined": "true"
+      },
+      "layer2": {
+        "fields": [
+          {
+            "field": "title",
+            "comparator": "titleVersionMatch",
+            "weight": 1.0,
+            "countIfUndefined": "false",
+            "params": {}
+          },
+          {
+            "field": "authors",
+            "comparator": "sizeMatch",
+            "weight": 1.0,
+            "countIfUndefined": "false",
+            "params": {}
+          }
+        ],
+        "threshold": 1.0,
+        "aggregation": "AND",
+        "positive": "layer3",
+        "negative": "NO_MATCH",
+        "undefined": "layer3",
+        "ignoreUndefined": "false"
+      },
+      "layer3": {
+        "fields": [
+          {
+            "field": "title",
+            "comparator": "levensteinTitle",
+            "weight": 1.0,
+            "countIfUndefined": "true",
+            "params": {}
+          }
+        ],
+        "threshold": 0.99,
+        "aggregation": "AVG",
+        "positive": "MATCH",
+        "negative": "NO_MATCH",
+        "undefined": "NO_MATCH",
+        "ignoreUndefined": "true"
+      }
+    },
+    "model": [
+      {
+        "name": "doi",
+        "type": "String",
+        "path": "$.pid[?(@.qualifier.classid == 'doi')].value"
+      },
+      {
+        "name": "pid",
+        "type": "JSON",
+        "path": "$.pid",
+        "overrideMatch": "true"
+      },
+      {
+        "name": "title",
+        "type": "String",
+        "path": "$.title[?(@.qualifier.classid == 'main title')].value",
+        "length": 250,
+        "size": 5
+      },
+      {
+        "name": "authors",
+        "type": "List",
+        "path": "$.author[*].fullname",
+        "size": 200
+      },
+      {
+        "name": "resulttype",
+        "type": "String",
+        "path": "$.resulttype.classid"
+      }
+    ],
+    "blacklists": {
+      "title": [
+        "^Inside Front Cover$",
+        "(?i)^Poster presentations$",
+        "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
+        "^Problems with perinatal pathology\\.?$",
+        "(?i)^Cases? of Puerperal Convulsions$",
+        "(?i)^Operative Gyna?ecology$",
+        "(?i)^Mind the gap\\!?\\:?$",
+        "^Chronic fatigue syndrome\\.?$",
+        "^Cartas? ao editor Letters? to the Editor$",
+        "^Note from the Editor$",
+        "^Anesthesia Abstract$",
+        "^Annual report$",
+        "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
+        "(?i)^Graph and Table of Infectious Diseases?$",
+        "^Presentation$",
+        "(?i)^Reviews and Information on Publications$",
+        "(?i)^PUBLIC HEALTH SERVICES?$",
+        "(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
+        "(?i)^Adrese autora$",
+        "(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
+        "(?i)^Acknowledgement to Referees$",
+        "(?i)^Behçet's disease\\.?$",
+        "(?i)^Isolation and identification of restriction endonuclease.*$",
+        "(?i)^CEREBROVASCULAR DISEASES?\\.?$",
+        "(?i)^Screening for abdominal aortic aneurysms?\\.?$",
+        "^Event management$",
+        "(?i)^Breakfast and Crohn's disease.*\\.?$",
+        "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
+        "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
+        "^Gushi hakubutsugaku$",
+        "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
+        "^Intestinal spirocha?etosis$",
+        "^Treatment of Rodent Ulcer$",
+        "(?i)^\\W*Cloud Computing\\W*$",
+        "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
+        "^Free Communications, Poster Presentations: Session [A-F]$",
+        "^“The Historical Aspects? of Quackery\\.?”$",
+        "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
+        "^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
+        "(?i)^Case Report$",
+        "^Boletín Informativo$",
+        "(?i)^Glioblastoma Multiforme$",
+        "(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
+        "^Zaměstnanecké výhody$",
+        "(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
+        "(?i)^Carotid body tumours?\\.?$",
+        "(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
+        "^Avant-propos$",
+        "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
+        "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
+        "(?i)^PUBLIC HEALTH VERSUS THE STATE$",
+        "^Viñetas de Cortázar$",
+        "(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
+        "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
+        "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
+        "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
+        "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
+        "^Aus der AGMB$",
+        "^Znanstveno-stručni prilozi$",
+        "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
+        "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
+        "(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
+        "^Finanční analýza podniku$",
+        "^Financial analysis( of business)?$",
+        "(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
+        "^Jikken nihon shūshinsho$",
+        "(?i)^CORONER('|s)(s|') INQUESTS$",
+        "(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
+        "(?i)^Consultants' contract(s)?$",
+        "(?i)^Upute autorima$",
+        "(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
+        "^Joshi shin kokubun$",
+        "^Kōtō shōgaku dokuhon nōson'yō$",
+        "^Jinjō shōgaku shōka$",
+        "^Shōgaku shūjichō$",
+        "^Nihon joshi dokuhon$",
+        "^Joshi shin dokuhon$",
+        "^Chūtō kanbun dokuhon$",
+        "^Wabun dokuhon$",
+        "(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
+        "(?i)^cardiac rehabilitation$",
+        "(?i)^Analytical summary$",
+        "^Thesaurus resolutionum Sacrae Congregationis Concilii$",
+        "(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
+        "^Prikazi i osvrti$",
+        "^Rodinný dům s provozovnou$",
+        "^Family house with an establishment$",
+        "^Shinsei chūtō shin kokugun$",
+        "^Pulmonary alveolar proteinosis(\\.?)$",
+        "^Shinshū kanbun$",
+        "^Viñeta(s?) de Rodríguez$",
+        "(?i)^RUBRIKA UREDNIKA$",
+        "^A Matching Model of the Academic Publication Market$",
+        "^Yōgaku kōyō$",
+        "^Internetový marketing$",
+        "^Internet marketing$",
+        "^Chūtō kokugo dokuhon$",
+        "^Kokugo dokuhon$",
+        "^Antibiotic Cover for Dental Extraction(s?)$",
+        "^Strategie podniku$",
+        "^Strategy of an Enterprise$",
+        "(?i)^respiratory disease(s?)(\\.?)$",
+        "^Award(s?) for Gallantry in Civil Defence$",
+        "^Podniková kultura$",
+        "^Corporate Culture$",
+        "^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
+        "^Pracovní motivace$",
+        "^Work Motivation$",
+        "^Kaitei kōtō jogaku dokuhon$",
+        "^Konsolidovaná účetní závěrka$",
+        "^Consolidated Financial Statements$",
+        "(?i)^intracranial tumour(s?)$",
+        "^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
+        "^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
+        "^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
+        "^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
+        "^Úroveň motivačního procesu jako způsobu vedení lidí$",
+        "^The level of motivation process as a leadership$",
+        "^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
+        "(?i)^news and events$",
+        "(?i)^NOVOSTI I DOGAĐAJI$",
+        "^Sansū no gakushū$",
+        "^Posouzení informačního systému firmy a návrh změn$",
+        "^Information System Assessment and Proposal for ICT Modification$",
+        "^Stresové zatížení pracovníků ve vybrané profesi$",
+        "^Stress load in a specific job$",
+        "^Sunday: Poster Sessions, Pt.*$",
+        "^Monday: Poster Sessions, Pt.*$",
+        "^Wednesday: Poster Sessions, Pt.*",
+        "^Tuesday: Poster Sessions, Pt.*$",
+        "^Analýza reklamy$",
+        "^Analysis of advertising$",
+        "^Shōgaku shūshinsho$",
+        "^Shōgaku sansū$",
+        "^Shintei joshi kokubun$",
+        "^Taishō joshi kokubun dokuhon$",
+        "^Joshi kokubun$",
+        "^Účetní uzávěrka a účetní závěrka v ČR$",
+        "(?i)^The \"?Causes\"? of Cancer$",
+        "^Normas para la publicación de artículos$",
+        "^Editor('|s)(s|') [Rr]eply$",
+        "^Editor(’|s)(s|’) letter$",
+        "^Redaktoriaus žodis$",
+        "^DISCUSSION ON THE PRECEDING PAPER$",
+        "^Kōtō shōgaku shūshinsho jidōyō$",
+        "^Shōgaku nihon rekishi$",
+        "^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
+        "^Préface$",
+        "^Occupational [Hh]ealth [Ss]ervices.$",
+        "^In Memoriam Professor Toshiyuki TAKESHIMA$",
+        "^Účetní závěrka ve vybraném podniku.*$",
+        "^Financial statements in selected company$",
+        "^Abdominal [Aa]ortic [Aa]neurysms.*$",
+        "^Pseudomyxoma peritonei$",
+        "^Kazalo autora$",
+        "(?i)^uvodna riječ$",
+        "^Motivace jako způsob vedení lidí$",
+        "^Motivation as a leadership$",
+        "^Polyfunkční dům$",
+        "^Multi\\-funkcional building$",
+        "^Podnikatelský plán$",
+        "(?i)^Podnikatelský záměr$",
+        "(?i)^Business Plan$",
+        "^Oceňování nemovitostí$",
+        "^Marketingová komunikace$",
+        "^Marketing communication$",
+        "^Sumario Analítico$",
+        "^Riječ uredništva$",
+        "^Savjetovanja i priredbe$",
+        "^Índice$",
+        "^(Starobosanski nadpisi).*$",
+        "^Vzdělávání pracovníků v organizaci$",
+        "^Staff training in organization$",
+        "^(Life Histories of North American Geometridae).*$",
+        "^Strategická analýza podniku$",
+        "^Strategic Analysis of an Enterprise$",
+        "^Sadržaj$",
+        "^Upute suradnicima$",
+        "^Rodinný dům$",
+        "(?i)^Fami(l)?ly house$",
+        "^Upute autorima$",
+        "^Strategic Analysis$",
+        "^Finanční analýza vybraného podniku$",
+        "^Finanční analýza$",
+        "^Riječ urednika$",
+        "(?i)^Content(s?)$",
+        "(?i)^Inhalt$",
+        "^Jinjō shōgaku shūshinsho jidōyō$",
+        "(?i)^Index$",
+        "^Chūgaku kokubun kyōkasho$",
+        "^Retrato de una mujer$",
+        "^Retrato de un hombre$",
+        "^Kōtō shōgaku dokuhon$",
+        "^Shotōka kokugo$",
+        "^Shōgaku dokuhon$",
+        "^Jinjō shōgaku kokugo dokuhon$",
+        "^Shinsei kokugo dokuhon$",
+        "^Teikoku dokuhon$",
+        "^Instructions to Authors$",
+        "^KİTAP TAHLİLİ$",
+        "^PRZEGLĄD PIŚMIENNICTWA$",
+        "(?i)^Presentación$",
+        "^İçindekiler$",
+        "(?i)^Tabl?e of contents$",
+        "^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
+        "^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
+        "^Editorial( Board)?$",
+        "(?i)^Editorial \\(English\\)$",
+        "^Editörden$",
+        "^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
+        "^(Kiri Karl Morgensternile).*$",
+        "^(\\[Eksliibris Aleksandr).*\\]$",
+        "^(\\[Eksliibris Aleksandr).*$",
+        "^(Eksliibris Aleksandr).*$",
+        "^(Kiri A\\. de Vignolles).*$",
+        "^(2 kirja Karl Morgensternile).*$",
+        "^(Pirita kloostri idaosa arheoloogilised).*$",
+        "^(Kiri tundmatule).*$",
+        "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
+        "^(Eksliibris Nikolai Birukovile).*$",
+        "^(Eksliibris Nikolai Issakovile).*$",
+        "^(WHP Cruise Summary Information of section).*$",
+        "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
+        "^(Measurement of the spin\\-dependent structure function).*",
+        "(?i)^.*authors['’′]? reply\\.?$",
+        "(?i)^.*authors['’′]? response\\.?$"
+      ]
+    },
+    "synonyms": {}
+  }
+}
--- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/sample.json
+++ b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/sample.json
--- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json
+++ b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json
--- a/dhp-workflows/docs/oozie-installer.markdown
+++ b/dhp-workflows/docs/oozie-installer.markdown
@ -73,7 +73,7 @@ Workflow definition requirements

 This property can be set using maven `-D` switch.

-`[oozie_app]` is the default directory name however it can be set to any value as soon as `oozieAppDir` property is provided with directory name as value. 
+`[oozie_app]` is the default directory name; however, it can be set to any value as long as the `oozieAppDir` property is provided with the directory name as its value.

 Subworkflows are supported as well and subworkflow directories should be nested within `[oozie_app]` directory. 

--- a/dhp-workflows/pom.xml
+++ b/dhp-workflows/pom.xml
@ -17,6 +17,7 @@
        <module>dhp-aggregation</module>
        <module>dhp-distcp</module>
        <module>dhp-graph-mapper</module>
+        <module>dhp-dedup</module>
    </modules>

    <pluginRepositories>
@ -310,6 +311,7 @@
                        </executions>
                    </plugin>

+
                    <plugin>
                        <!-- this plugin prepares oozie installer package-->

@ -523,6 +525,7 @@
                    <plugin>
                        <groupId>org.apache.maven.plugins</groupId>
                        <artifactId>maven-failsafe-plugin</artifactId>
+                        <version>${maven.failsave.plugin.version}</version>
                        <executions>
                            <execution>
                                <id>default-integration-test</id>
--- a/pom.xml
+++ b/pom.xml
@ -114,6 +114,12 @@
                <version>${dhp.spark.version}</version>
                <scope>provided</scope>
            </dependency>
+            <dependency>
+                <groupId>org.apache.spark</groupId>
+                <artifactId>spark-graphx_2.11</artifactId>
+                <version>${dhp.spark.version}</version>
+                <scope>provided</scope>
+            </dependency>

            <dependency>
                <groupId>org.apache.commons</groupId>
@ -177,6 +183,7 @@
                <version>${dhp.jackson.version}</version>
                <scope>provided</scope>
            </dependency>
+
            <dependency>
                <groupId>com.fasterxml.jackson.core</groupId>
                <artifactId>jackson-annotations</artifactId>
@ -190,6 +197,12 @@
                <scope>provided</scope>
            </dependency>

+            <dependency>
+                <groupId>eu.dnetlib</groupId>
+                <artifactId>dnet-pace-core</artifactId>
+                <version>4.0.0-SNAPSHOT</version>
+            </dependency>
+

            <dependency>
                <groupId>javax.persistence</groupId>
@ -203,6 +216,21 @@
                <artifactId>amqp-client</artifactId>
                <version>5.6.0</version>
            </dependency>
+            <dependency>
+                <groupId>com.jayway.jsonpath</groupId>
+                <artifactId>json-path</artifactId>
+                <version>2.4.0</version>
+            </dependency>
+            <dependency>
+                <groupId>com.arakelian</groupId>
+                <artifactId>java-jq</artifactId>
+                <version>0.10.1</version>
+            </dependency>
+            <dependency>
+                <groupId>edu.cmu</groupId>
+                <artifactId>secondstring</artifactId>
+                <version>1.0.0</version>
+            </dependency>

            <dependency>
                <groupId>org.apache.oozie</groupId>
@ -230,7 +258,7 @@
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-compiler-plugin</artifactId>
-                    <version>3.6.0</version>
+                    <version>${maven.compiler.plugin.version}</version>
                    <configuration>
                        <source>1.8</source>
                        <target>1.8</target>
@ -259,27 +287,6 @@
                    </executions>
                </plugin>

-                <plugin>
-                    <groupId>eu.dnetlib</groupId>
-                    <artifactId>protoc-jar-maven-plugin</artifactId>
-                    <version>1.1.0</version>
-                    <executions>
-                        <execution>
-                            <phase>generate-sources</phase>
-                            <goals>
-                                <goal>run</goal>
-                            </goals>
-                            <configuration>
-                                <protocVersion>${google.protobuf.version}</protocVersion>
-                                <inputDirectories>
-                                    <include>src/main/resources</include>
-                                </inputDirectories>
-                                <outputDirectory>src/gen/java</outputDirectory>
-                            </configuration>
-                        </execution>
-                    </executions>
-                </plugin>
-
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-surefire-plugin</artifactId>
@ -342,6 +349,31 @@
                    </execution>
                </executions>
            </plugin>
+            <plugin>
+                <groupId>net.alchim31.maven</groupId>
+                <artifactId>scala-maven-plugin</artifactId>
+                <version>4.0.1</version>
+                <executions>
+                    <execution>
+                        <id>scala-compile-first</id>
+                        <phase>initialize</phase>
+                        <goals>
+                            <goal>add-source</goal>
+                            <goal>compile</goal>
+                        </goals>
+                    </execution>
+                    <execution>
+                        <id>scala-test-compile</id>
+                        <phase>process-test-resources</phase>
+                        <goals>
+                            <goal>testCompile</goal>
+                        </goals>
+                    </execution>
+                </executions>
+                <configuration>
+                    <scalaVersion>${scala.version}</scalaVersion>
+                </configuration>
+            </plugin>
        </plugins>

        <extensions>
@ -380,14 +412,15 @@
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
-
+        <maven.compiler.plugin.version>3.6.0</maven.compiler.plugin.version>
+        <maven.failsave.plugin.version>2.22.2</maven.failsave.plugin.version>
        <dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
        <dhp.hadoop.version>2.6.0-${dhp.cdh.version}</dhp.hadoop.version>
        <dhp.oozie.version>4.1.0-${dhp.cdh.version}</dhp.oozie.version>
        <dhp.spark.version>2.4.0.cloudera2</dhp.spark.version>
        <dhp.jackson.version>2.9.6</dhp.jackson.version>
        <dhp.commons.lang.version>3.5</dhp.commons.lang.version>
-        <scala.version>2.11.8</scala.version>
+        <scala.version>2.11.12</scala.version>
    </properties>
 </project>