implemented DedupRecord factory with the merge of publications

2019-12-11 15:43:24 +01:00 · 2019-12-11 15:43:24 +01:00 · abd9034da0
parent 4b66b471a4
commit abd9034da0
30 changed files with 686 additions and 262 deletions
--- a/.gitignore
+++ b/.gitignore
@ -18,4 +18,5 @@
 /*/build
 /build
 spark-warehouse
-/dhp-workflows/dhp-graph-mapper/job-override.properties
+/*/*/job-override.properties
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@ -17,6 +17,10 @@
 			<groupId>commons-cli</groupId>
 			<artifactId>commons-cli</artifactId>
 		</dependency>
 		<dependency>
 			<groupId>commons-io</groupId>
 			<artifactId>commons-io</artifactId>
 		</dependency>
 		<dependency>
 			<groupId>org.apache.commons</groupId>
 			<artifactId>commons-lang3</artifactId>
@ -29,21 +33,15 @@
 			<groupId>javax.persistence</groupId>
 			<artifactId>javax.persistence-api</artifactId>
 		</dependency>
 		<dependency>
 			<groupId>com.fasterxml.jackson.core</groupId>
 			<artifactId>jackson-databind</artifactId>
 		</dependency>
 		<!-- https://mvnrepository.com/artifact/com.rabbitmq/amqp-client -->
 		<dependency>
 			<groupId>com.rabbitmq</groupId>
 			<artifactId>amqp-client</artifactId>
 		</dependency>
 		<dependency>
 			<groupId>commons-io</groupId>
 			<artifactId>commons-io</artifactId>
 		</dependency>
 	</dependencies>
 </project>
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java
@ -2,17 +2,25 @@ package eu.dnetlib.dhp.application;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import org.apache.commons.cli.*;
 import org.apache.commons.codec.binary.Base64;
 import org.apache.commons.io.IOUtils;
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.Serializable;
-import java.util.Arrays;
+import java.io.StringWriter;
-import java.util.HashMap;
+import java.util.*;
-import java.util.Map;
+import java.util.zip.GZIPInputStream;
 import java.util.zip.GZIPOutputStream;
 import java.util.zip.Inflater;
 public class ArgumentApplicationParser implements Serializable {
    private final Options options = new Options();
    private final Map<String, String> objectMap = new HashMap<>();
    private final List<String> compressedValues = new ArrayList<>();
    public ArgumentApplicationParser(final String json_configuration) throws Exception {
        final ObjectMapper mapper = new ObjectMapper();
        final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class);
@ -29,6 +37,9 @@ public class ArgumentApplicationParser implements Serializable {
            final Option o = new Option(conf.getParamName(), true, conf.getParamDescription());
            o.setLongOpt(conf.getParamLongName());
            o.setRequired(conf.isParamRequired());
            if (conf.isCompressed()) {
                compressedValues.add(conf.getParamLongName());
            }
            return o;
        }).forEach(options::addOption);
@ -38,10 +49,32 @@ public class ArgumentApplicationParser implements Serializable {
    }
    public static String decompressValue(final String abstractCompressed) {
        try {
            byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes());
            GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray));
            final StringWriter stringWriter = new StringWriter();
            IOUtils.copy(gis, stringWriter);
            return stringWriter.toString();
        } catch (Throwable e) {
            System.out.println("Wrong value to decompress:" + abstractCompressed);
            throw new RuntimeException(e);
        }
    }
    public static String compressArgument(final String value)  throws Exception{
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        GZIPOutputStream gzip = new GZIPOutputStream(out);
        gzip.write(value.getBytes());
        gzip.close();
        return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
    }
    public void parseArgument(final String[] args) throws Exception {
        CommandLineParser parser = new BasicParser();
        CommandLine cmd = parser.parse(options, args);
-        Arrays.stream(cmd.getOptions()).forEach(it -> objectMap.put(it.getLongOpt(), it.getValue()));
+        Arrays.stream(cmd.getOptions()).forEach(it -> objectMap.put(it.getLongOpt(), compressedValues.contains(it.getLongOpt())? decompressValue(it.getValue()): it.getValue()));
    }
    public String get(final String key) {
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java
@ -7,6 +7,7 @@ public class OptionsParameter {
    private String paramLongName;
    private String paramDescription;
    private boolean paramRequired;
    private boolean compressed;
    public OptionsParameter() {
    }
@ -26,4 +27,12 @@ public class OptionsParameter {
    public boolean isParamRequired() {
        return paramRequired;
    }
    public boolean isCompressed() {
        return compressed;
    }
    public void setCompressed(boolean compressed) {
        this.compressed = compressed;
    }
 }
--- a/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java
@ -3,6 +3,10 @@ package eu.dnetlib.dhp.application;
 import org.apache.commons.io.IOUtils;
 import org.junit.Test;
 import java.io.ByteArrayOutputStream;
 import java.util.Base64;
 import java.util.zip.GZIPOutputStream;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
@ -24,6 +28,7 @@ public class ArgumentApplicationParserTest {
                "-ro", "value7",
                "-rr", "value8",
                "-w", "value9",
                "-cc", ArgumentApplicationParser.compressArgument(jsonConfiguration)
        });
        assertNotNull(parser.get("hdfsPath"));
        assertNotNull(parser.get("apidescriptor"));
@ -45,7 +50,12 @@ public class ArgumentApplicationParserTest {
        assertEquals("value7", parser.get("rabbitOngoingQueue"));
        assertEquals("value8", parser.get("rabbitReportQueue"));
        assertEquals("value9", parser.get("workflowId"));
        assertEquals(jsonConfiguration, parser.get("ccCoco"));
    }
 }
--- a/dhp-common/src/test/resources/eu/dnetlib/application/parameters.json
+++ b/dhp-common/src/test/resources/eu/dnetlib/application/parameters.json
@ -1,12 +1,13 @@
 [
-  {"paramName":"p",   "paramLongName":"hdfsPath",           "paramDescription": "the path where storing the sequential file",           "paramRequired": true},
+  {"paramName":"p",   "paramLongName":"hdfsPath",           "paramDescription": "the path where storing the sequential file",            "paramRequired": true},
-  {"paramName":"a",   "paramLongName":"apidescriptor",      "paramDescription": "the JSON encoding of the API Descriptor",              "paramRequired": true},
+  {"paramName":"a",   "paramLongName":"apidescriptor",      "paramDescription": "the JSON encoding of the API Descriptor",               "paramRequired": true},
-  {"paramName":"n",   "paramLongName":"namenode",           "paramDescription": "the Name Node URI",                                    "paramRequired": true},
+  {"paramName":"n",   "paramLongName":"namenode",           "paramDescription": "the Name Node URI",                                     "paramRequired": true},
-  {"paramName":"u",   "paramLongName":"userHDFS",           "paramDescription": "the user wich create the hdfs seq file",               "paramRequired": true},
+  {"paramName":"u",   "paramLongName":"userHDFS",           "paramDescription": "the user wich create the hdfs seq file",                "paramRequired": true},
-  {"paramName":"ru",  "paramLongName":"rabbitUser",         "paramDescription": "the user to connect with RabbitMq for messaging",      "paramRequired": true},
+  {"paramName":"ru",  "paramLongName":"rabbitUser",         "paramDescription": "the user to connect with RabbitMq for messaging",       "paramRequired": true},
-  {"paramName":"rp",  "paramLongName":"rabbitPassWord",     "paramDescription": "the password to connect with RabbitMq for messaging",  "paramRequired": true},
+  {"paramName":"rp",  "paramLongName":"rabbitPassWord",     "paramDescription": "the password to connect with RabbitMq for messaging",   "paramRequired": true},
-  {"paramName":"rh",  "paramLongName":"rabbitHost",         "paramDescription": "the host of the RabbitMq server",                      "paramRequired": true},
+  {"paramName":"rh",  "paramLongName":"rabbitHost",         "paramDescription": "the host of the RabbitMq server",                       "paramRequired": true},
-  {"paramName":"ro",  "paramLongName":"rabbitOngoingQueue", "paramDescription": "the name of the ongoing queue",                        "paramRequired": true},
+  {"paramName":"ro",  "paramLongName":"rabbitOngoingQueue", "paramDescription": "the name of the ongoing queue",                         "paramRequired": true},
-  {"paramName":"rr",  "paramLongName":"rabbitReportQueue",  "paramDescription": "the name of the report queue",                         "paramRequired": true},
+  {"paramName":"rr",  "paramLongName":"rabbitReportQueue",  "paramDescription": "the name of the report queue",                          "paramRequired": true},
-  {"paramName":"w",   "paramLongName":"workflowId",         "paramDescription": "the identifier of the dnet Workflow",                  "paramRequired": true}
+  {"paramName":"w",   "paramLongName":"workflowId",         "paramDescription": "the identifier of the dnet Workflow",                   "paramRequired": true},
  {"paramName":"cc",   "paramLongName":"ccCoco",            "paramDescription": "the identifier of the dnet Workflow", "compressed":true,"paramRequired": true}
 ]
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java
@ -84,4 +84,31 @@ public class Instance implements Serializable {
    public void setDateofacceptance(Field<String> dateofacceptance) {
        this.dateofacceptance = dateofacceptance;
    }
    public String toComparableString(){
        return String.format("%s::%s::%s::%s",
                hostedby != null && hostedby.getKey()!= null  ? hostedby.getKey().toLowerCase() : "",
                accessright!= null && accessright.getClassid()!= null ? accessright.getClassid() : "",
                instancetype!= null && instancetype.getClassid()!= null ? instancetype.getClassid() : "",
                url != null ? url:"");
    }
    @Override
    public int hashCode() {
        return toComparableString().hashCode();
    }
    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        Instance other = (Instance) obj;
        return toComparableString()
                .equals(other.toComparableString());
    }
 }
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java
@ -6,6 +6,8 @@ import java.io.Serializable;
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
 import java.util.Optional;
 import java.util.stream.Collectors;
 public abstract class Result extends OafEntity implements Serializable {
@ -251,13 +253,12 @@ public abstract class Result extends OafEntity implements Serializable {
        Result r = (Result) e;
-        mergeAuthors(r.getAuthor());
+
        //TODO mergeFrom is used only for create Dedup Records since the creation of these two fields requires more complex functions (maybe they will be filled in an external function)
 //        if (author == null)
 //            author = r.getAuthor(); //authors will be replaced because they could be too much
 //        dateofacceptance = r.getDateofacceptance();
-//        instance = mergeLists(instance, r.getInstance());
+
        instance = mergeLists(instance, r.getInstance());
        if (r.getResulttype() != null)
            resulttype = r.getResulttype();
@ -309,80 +310,5 @@ public abstract class Result extends OafEntity implements Serializable {
    }
    public void mergeAuthors(List<Author> authors){
        int c1 = countAuthorsPids(author);
        int c2 = countAuthorsPids(authors);
        int s1 = authorsSize(author);
        int s2 = authorsSize(authors);
        //if both have no authors with pids and authors is bigger than author
        if (c1 == 0 && c2 == 0 && author.size()<authors.size()) {
            author = authors;
            return;
        }
        //author is null and authors have 0 or more authors with pids
        if (c1<c2 && c1<0) {
            author = authors;
            return;
        }
        //andiamo a mangiare
 //        if (author == null && authors == null)
 //            return;
 //
 //        int c1 = countAuthorsPids(author);
 //        int c2 = countAuthorsPids(authors);
 //
 //        if (c1<c2 && c1<1){
 //            author = authors;
 //            return;
 //        }
 //
 //        if (c1<c2)
    }
    public int countAuthorsPids(List<Author> authors){
        if (authors == null)
            return -1;
        return (int) authors.stream().map(this::extractAuthorPid).filter(Objects::nonNull).filter(StringUtils::isNotBlank).count();
    }
    public int authorsSize(List<Author> authors){
        if (authors == null)
            return 0;
        return authors.size();
    }
    public String extractAuthorPid(Author a){
        if(a == null || a.getPid() == null || a.getPid().size() == 0)
            return null;
        StringBuilder mainPid = new StringBuilder();
        a.getPid().forEach(pid ->{
            if (pid.getQualifier().getClassid().equalsIgnoreCase("orcid")) {
                mainPid.setLength(0);
                mainPid.append(pid.getValue());
            }
            else {
                if(mainPid.length() == 0)
                    mainPid.append(pid.getValue());
            }
        });
        return mainPid.toString();
    }
 }
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl
@ -9,7 +9,7 @@
            <xsl:copy-of select="//oai:header"/>
            <metadata>
                <xsl:for-each select="//*[local-name()='subject']">
-                    <subject><xsl:dedupId-of select="eg:clean(.,'dnet:languages')"/></subject>
+                    <subject><xsl:value-of select="eg:clean(.,'dnet:languages')"/></subject>
                </xsl:for-each>
            </metadata>
            <oaf:about>
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/tr.xml
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/tr.xml
@ -1,11 +1,11 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <RESOURCE_PROFILE>
    <HEADER>
-        <RESOURCE_IDENTIFIER dedupId="d6fa79f2-486e-482d-b37c-62129af2cd9a_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU="/>
+        <RESOURCE_IDENTIFIER value="d6fa79f2-486e-482d-b37c-62129af2cd9a_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU="/>
-        <RESOURCE_TYPE dedupId="TransformationRuleDSResourceType"/>
+        <RESOURCE_TYPE value="TransformationRuleDSResourceType"/>
-        <RESOURCE_KIND dedupId="TransformationRuleDSResources"/>
+        <RESOURCE_KIND value="TransformationRuleDSResources"/>
-        <RESOURCE_URI dedupId=""/>
+        <RESOURCE_URI value=""/>
-        <DATE_OF_CREATION dedupId="2019-04-11T11:15:30+00:00"/>
+        <DATE_OF_CREATION value="2019-04-11T11:15:30+00:00"/>
    </HEADER>
    <BODY>
        <CONFIGURATION>
@ -24,7 +24,7 @@
                                <xsl:copy-of select="//oai:header"/>
                                <metadata>
                                    <xsl:for-each select="//*[local-name()='subject']">
-                                        <subject><xsl:dedupId-of select="eg:clean(.,'dnet:languages')"/></subject>
+                                        <subject><xsl:value-of select="eg:clean(.,'dnet:languages')"/></subject>
                                    </xsl:for-each>
                                </metadata>
                                <oaf:about>
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DatePicker.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DatePicker.java
@ -0,0 +1,119 @@
 package eu.dnetlib.dedup;
 import eu.dnetlib.dhp.schema.oaf.Field;
 import org.apache.commons.lang.StringUtils;
 import java.time.Year;
 import java.util.*;
 import java.util.stream.Collectors;
 import static java.util.Collections.reverseOrder;
 import static java.util.Map.Entry.comparingByValue;
 import static java.util.stream.Collectors.toMap;
 import static org.apache.commons.lang.StringUtils.endsWith;
 import static org.apache.commons.lang.StringUtils.substringBefore;
 public class DatePicker {
    private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}";
    private static final String DATE_DEFAULT_SUFFIX = "01-01";
    private static final int YEAR_LB = 1300;
    private static final int YEAR_UB = Year.now().getValue() + 5;
    public static Field<String> pick(final Collection<String> dateofacceptance) {
        final Map<String, Integer> frequencies = dateofacceptance
                .parallelStream()
                .filter(StringUtils::isNotBlank)
                .collect(
                        Collectors.toConcurrentMap(
                                w -> w, w -> 1, Integer::sum));
        if (frequencies.isEmpty()) {
            return new Field<>();
        }
        final Field<String> date = new Field<>();
                date.setValue(frequencies.keySet().iterator().next());
        // let's sort this map by values first, filtering out invalid dates
        final Map<String, Integer> sorted = frequencies
                .entrySet()
                .stream()
                .filter(d -> StringUtils.isNotBlank(d.getKey()))
                .filter(d -> d.getKey().matches(DATE_PATTERN))
                .filter(d -> inRange(d.getKey()))
                .sorted(reverseOrder(comparingByValue()))
                .collect(
                        toMap(
                                Map.Entry::getKey,
                                Map.Entry::getValue, (e1, e2) -> e2,
                                LinkedHashMap::new));
        // shortcut
        if (sorted.size() == 0) {
            return date;
        }
        // voting method (1/3 + 1) wins
        if (sorted.size() >= 3) {
            final int acceptThreshold = (sorted.size() / 3) + 1;
            final List<String> accepted = sorted.entrySet().stream()
                    .filter(e -> e.getValue() >= acceptThreshold)
                    .map(e -> e.getKey())
                    .collect(Collectors.toList());
            // cannot find strong majority
            if (accepted.isEmpty()) {
                final int max = sorted.values().iterator().next();
                Optional<String> first = sorted.entrySet().stream()
                        .filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX))
                        .map(Map.Entry::getKey)
                        .findFirst();
                if (first.isPresent()) {
                    date.setValue(first.get());
                    return date;
                }
                date.setValue(sorted.keySet().iterator().next());
                return date;
            }
            if (accepted.size() == 1) {
                date.setValue(accepted.get(0));
                return date;
            } else {
                final Optional<String> first = accepted.stream()
                        .filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX))
                        .findFirst();
                if (first.isPresent()) {
                    date.setValue(first.get());
                    return date;
                }
                return date;
            }
            //1st non YYYY-01-01 is returned
        } else {
            if (sorted.size() == 2) {
                for (Map.Entry<String, Integer> e : sorted.entrySet()) {
                    if (!endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) {
                        date.setValue(e.getKey());
                        return date;
                    }
                }
            }
            // none of the dates seems good enough, return the 1st one
            date.setValue(sorted.keySet().iterator().next());
            return date;
        }
    }
    private static boolean inRange(final String date) {
        final int year = Integer.parseInt(substringBefore(date, "-"));
        return year >= YEAR_LB && year <= YEAR_UB;
    }
 }
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java
@ -1,6 +1,5 @@
 package eu.dnetlib.dedup;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Lists;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.pace.config.DedupConfig;
@ -10,23 +9,20 @@ import org.apache.commons.lang.StringUtils;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.Function;
 import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
 import org.codehaus.jackson.map.ObjectMapper;
 import scala.Tuple2;
 import java.io.IOException;
 import java.util.Collection;
 import java.util.List;
 import java.util.Random;
 import static java.util.stream.Collectors.toMap;
 public class DedupRecordFactory {
-    public JavaRDD<OafEntity> createDedupRecord(final JavaSparkContext sc, final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final OafEntityType entityType, final DedupConfig dedupConf){
+    public static JavaRDD<OafEntity> createDedupRecord(final JavaSparkContext sc, final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final OafEntityType entityType, final DedupConfig dedupConf){
        //<id, json_entity>
        final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(entitiesInputPath)
@ -51,7 +47,12 @@ public class DedupRecordFactory {
            String idValue = json._1();
-            String trust = MapDocumentUtil.getJPathString("$.dataInfo.trust", json._2());
+            String trust ="";
            try {
                trust = MapDocumentUtil.getJPathString("$.dataInfo.trust", json._2());
            } catch (Throwable e) {
            }
            //TODO remember to replace this with the actual trust retrieving
            if (StringUtils.isBlank(trust)) {
@ -71,28 +72,32 @@ public class DedupRecordFactory {
                .groupByKey();
        switch(entityType){
-            case Publication:
+            case publication:
-                return sortedJoinResult.map(this::publicationMerger);
+                return sortedJoinResult.map(DedupRecordFactory::publicationMerger);
-            case Dataset:
+            case dataset:
-                return sortedJoinResult.map(this::datasetMerger);
+                return sortedJoinResult.map(DedupRecordFactory::datasetMerger);
-            case Project:
+            case project:
-                return sortedJoinResult.map(this::projectMerger);
+                return sortedJoinResult.map(DedupRecordFactory::projectMerger);
-            case Software:
+            case software:
-                return sortedJoinResult.map(this::softwareMerger);
+                return sortedJoinResult.map(DedupRecordFactory::softwareMerger);
-            case Datasource:
+            case datasource:
-                return sortedJoinResult.map(this::datasourceMerger);
+                return sortedJoinResult.map(DedupRecordFactory::datasourceMerger);
-            case Organization:
+            case organization:
-                return sortedJoinResult.map(this::organizationMerger);
+                return sortedJoinResult.map(DedupRecordFactory::organizationMerger);
-            case OtherResearchProduct:
+            case otherresearchproduct:
-                return sortedJoinResult.map(this::otherresearchproductMerger);
+                return sortedJoinResult.map(DedupRecordFactory::otherresearchproductMerger);
            default:
                return null;
        }
    }
-    private Publication publicationMerger(Tuple2<String, Iterable<String>> e){
+    private static Publication publicationMerger(Tuple2<String, Iterable<String>> e){
        Publication p = new Publication(); //the result of the merge, to be returned at the end
@ -101,67 +106,59 @@ public class DedupRecordFactory {
        final ObjectMapper mapper = new ObjectMapper();
        final Collection<String> dateofacceptance = Lists.newArrayList();
-        final Collection<List<Author>> authors = Lists.newArrayList();
+
        final Collection<List<Instance>> instances = Lists.newArrayList();
        StringBuilder trust = new StringBuilder("0.0");
        if (e._2() != null)
        e._2().forEach(pub -> {
            try {
                Publication publication = mapper.readValue(pub, Publication.class);
                final String currentTrust = publication.getDataInfo().getTrust();
-                if (!currentTrust.equals("1.0")) {
+                if (!"1.0".equals(currentTrust)) {
                    trust.setLength(0);
                    trust.append(currentTrust);
                }
                p.mergeFrom(publication);
-
+                p.setAuthor(DedupUtility.mergeAuthor(p.getAuthor(), publication.getAuthor()));
                //add to the list if they are not null
                if (publication.getDateofacceptance() != null)
                    dateofacceptance.add(publication.getDateofacceptance().getValue());
-                if (publication.getAuthor() != null)
+            } catch (Exception exc){
-                    authors.add(publication.getAuthor());
+                throw  new RuntimeException(exc);
-                if (publication.getInstance() != null)
+            }
                    instances.add(publication.getInstance());
            } catch (Exception exc){}
        });
-
+        p.setDateofacceptance(DatePicker.pick(dateofacceptance));
        p.setAuthor(null); //TODO create a single list of authors to put in the final publication
        return p;
    }
-    private Dataset datasetMerger(Tuple2<String, Iterable<String>> e){
+    private static Dataset datasetMerger(Tuple2<String, Iterable<String>> e){
        throw new NotImplementedException();
    }
-    private Project projectMerger(Tuple2<String, Iterable<String>> e){
+    private static Project projectMerger(Tuple2<String, Iterable<String>> e){
        throw new NotImplementedException();
    }
-    private Software softwareMerger(Tuple2<String, Iterable<String>> e){
+    private static Software softwareMerger(Tuple2<String, Iterable<String>> e){
        throw new NotImplementedException();
    }
-    private Datasource datasourceMerger(Tuple2<String, Iterable<String>> e){
+    private static Datasource datasourceMerger(Tuple2<String, Iterable<String>> e){
        throw new NotImplementedException();
    }
-    private Organization organizationMerger(Tuple2<String, Iterable<String>> e){
+    private static Organization organizationMerger(Tuple2<String, Iterable<String>> e){
        throw new NotImplementedException();
    }
-    private OtherResearchProduct otherresearchproductMerger(Tuple2<String, Iterable<String>> e){
+    private static OtherResearchProduct otherresearchproductMerger(Tuple2<String, Iterable<String>> e){
        throw new NotImplementedException();
    }
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java
@ -1,11 +1,17 @@
 package eu.dnetlib.dedup;
 import com.google.common.collect.Sets;
 import com.wcohen.ss.JaroWinkler;
 import eu.dnetlib.dhp.schema.oaf.Author;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.model.MapDocument;
 import eu.dnetlib.pace.model.Person;
 import org.apache.commons.codec.binary.Hex;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FileSystem;
@ -14,32 +20,35 @@ import org.apache.spark.SparkContext;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.util.LongAccumulator;
 import scala.Tuple2;
 import java.io.IOException;
 import java.io.StringWriter;
 import java.nio.charset.StandardCharsets;
 import java.security.MessageDigest;
-import java.util.HashMap;
+import java.text.Normalizer;
-import java.util.Map;
+import java.util.*;
-import java.util.Set;
+import java.util.stream.Collectors;
 import java.util.stream.Stream;
 public class DedupUtility {
    private static final Double THRESHOLD = 0.95;
    public static Map<String, LongAccumulator> constructAccumulator(final DedupConfig dedupConf, final SparkContext context) {
        Map<String, LongAccumulator> accumulators = new HashMap<>();
-        String acc1 = String.format("%s::%s",dedupConf.getWf().getEntityType(), "records per hash key = 1");
+        String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1");
        accumulators.put(acc1, context.longAccumulator(acc1));
-        String acc2 = String.format("%s::%s",dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField());
+        String acc2 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField());
        accumulators.put(acc2, context.longAccumulator(acc2));
-        String acc3 = String.format("%s::%s",dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize()));
+        String acc3 = String.format("%s::%s", dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize()));
        accumulators.put(acc3, context.longAccumulator(acc3));
-        String acc4 = String.format("%s::%s",dedupConf.getWf().getEntityType(), "skip list");
+        String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list");
        accumulators.put(acc4, context.longAccumulator(acc4));
-        String acc5 = String.format("%s::%s",dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)");
+        String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)");
        accumulators.put(acc5, context.longAccumulator(acc5));
-        String acc6 = String.format("%s::%s",dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold());
+        String acc6 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold());
        accumulators.put(acc6, context.longAccumulator(acc6));
        return accumulators;
@ -52,7 +61,7 @@ public class DedupUtility {
    public static void deleteIfExists(String path) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fileSystem = FileSystem.get(conf);
-        if (fileSystem.exists(new Path(path))){
+        if (fileSystem.exists(new Path(path))) {
            fileSystem.delete(new Path(path), true);
        }
    }
@ -91,4 +100,171 @@ public class DedupUtility {
            return null;
        }
    }
    public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
        int pa = countAuthorsPids(a);
        int pb = countAuthorsPids(b);
        List<Author> base, enrich;
        int sa = authorsSize(a);
        int sb = authorsSize(b);
        if(pa == pb){
            base = sa>sb?a:b;
            enrich = sa>sb?b:a;
        } else {
            base = pa>pb?a:b;
            enrich = pa>pb?b:a;
        }
        enrichPidFromList(base, enrich);
        return base;
 //        //if both have no authors with pids
 //        if (pa < 1 && pb < 1) {
 //            //B is bigger than A
 //            if (sa < sb)
 //                return b;
 //                //A is bigger than B
 //            else
 //                return a;
 //        }
 //        //If A has author with pids
 //        if (pa > 0) {
 //            //B has no author with pid
 //            if (pb < 1)
 //                return a;
 //                //B has author with pid
 //            else {
 //                enrichPidFromList(a, b);
 //                return a;
 //            }
 //        }
 //        //If B has author with pids
 //        //A has no author with pid
 //        if (pa < 1)
 //            return b;
 //            //A has author with pid
 //        else {
 //            enrichPidFromList(b, a);
 //            return b;
 //        }
    }
    private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
        if(base==null || enrich == null)
            return;
        final Map<String, Author> basePidAuthorMap = base.stream()
                .filter(a -> a.getPid() != null && a.getPid().size() > 0)
                .flatMap(a -> a.getPid()
                        .stream()
                        .map(p -> new Tuple2<>(p.toComparableString(), a))
                ).collect(Collectors.toMap(Tuple2::_1, Tuple2::_2));
        final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
                .stream()
                .filter(a -> a.getPid() != null && a.getPid().size() > 0)
                .flatMap(a -> a.getPid().stream().filter(p -> !basePidAuthorMap.containsKey(p.toComparableString())).map(p -> new Tuple2<>(p, a)))
                .collect(Collectors.toList());
        pidToEnrich.forEach(a -> {
            Optional<Tuple2<Double, Author>> simAuhtor = base.stream().map(ba -> new Tuple2<>(sim(ba, a._2()), ba)).max(Comparator.comparing(Tuple2::_1));
            if (simAuhtor.isPresent() && simAuhtor.get()._1()> THRESHOLD) {
                Author r = simAuhtor.get()._2();
                r.getPid().add(a._1());
            }
        });
    }
    public static String createEntityPath(final String basePath, final String entityType) {
        return String.format("%s/%s", basePath,entityType);
    }
    public static String createSimRelPath(final String basePath, final String entityType) {
        return String.format("%s/%s_simRel", basePath,entityType);
    }
    public static String createMergeRelPath(final String basePath, final String entityType) {
        return String.format("%s/%s_mergeRel", basePath,entityType);
    }
    private static Double sim(Author a, Author b) {
        final Person pa = parse(a);
        final Person pb = parse(b);
        if (pa.isAccurate() & pb.isAccurate()) {
            return new JaroWinkler().score(
                    normalize(pa.getSurnameString()),
                    normalize(pb.getSurnameString()));
        } else {
            return new JaroWinkler().score(
                    normalize(pa.getNormalisedFullname()),
                    normalize(pb.getNormalisedFullname()));
        }
    }
    private static String normalize(final String s) {
        return nfd(s).toLowerCase()
                // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
                .replaceAll("(\\W)+", " ")
                .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
                .replaceAll("(\\p{Punct})+", " ")
                .replaceAll("(\\d)+", " ")
                .replaceAll("(\\n)+", " ")
                .trim();
    }
    private static String nfd(final String s) {
        return Normalizer.normalize(s, Normalizer.Form.NFD);
    }
    private static Person parse(Author author) {
        if (StringUtils.isNotBlank(author.getSurname())) {
            return new Person(author.getSurname() + ", " + author.getName(), false);
        } else {
            return new Person(author.getFullname(), false);
        }
    }
    private static int countAuthorsPids(List<Author> authors) {
        if (authors == null)
            return 0;
        return (int) authors.stream().map(DedupUtility::extractAuthorPid).filter(Objects::nonNull).filter(StringUtils::isNotBlank).count();
    }
    private static int authorsSize(List<Author> authors) {
        if (authors == null)
            return 0;
        return authors.size();
    }
    private static boolean isAccurate(final Author a) {
        return StringUtils.isNotBlank(a.getName()) && StringUtils.isNotBlank(a.getSurname());
    }
    private static String extractAuthorPid(Author a) {
        if (a == null || a.getPid() == null || a.getPid().size() == 0)
            return null;
        StringBuilder mainPid = new StringBuilder();
        a.getPid().forEach(pid -> {
            if (pid.getQualifier().getClassid().equalsIgnoreCase("orcid")) {
                mainPid.setLength(0);
                mainPid.append(pid.getValue());
            } else {
                if (mainPid.length() == 0)
                    mainPid.append(pid.getValue());
            }
        });
        return mainPid.toString();
    }
 }
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java
@ -2,12 +2,14 @@ package eu.dnetlib.dedup;
 public enum OafEntityType {
-    Datasource,
+    datasource,
-    Organization,
+    organization,
-    Project,
+    project,
-    Dataset,
+    dataset,
-    OtherResearchProduct,
+    otherresearchproduct,
-    Software,
+    software,
-    Publication
+    publication
 }
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java
@ -46,7 +46,7 @@ public class SparkCreateConnectedComponent {
                        s -> new Tuple2<Object, String>((long) s.hashCode(), s)
                );
-        final Dataset<Relation> similarityRelations = spark.read().load(targetPath + "/" + entity+"_simrel").as(Encoders.bean(Relation.class));
+        final Dataset<Relation> similarityRelations = spark.read().load(DedupUtility.createSimRelPath(targetPath,entity)).as(Encoders.bean(Relation.class));
        final RDD<Edge<String>> edgeRdd = similarityRelations.javaRDD().map(it -> new Edge<>(it.getSource().hashCode(), it.getTarget().hashCode(), it.getRelClass())).rdd();
@ -73,7 +73,7 @@ public class SparkCreateConnectedComponent {
                            return tmp.stream();
                        }).iterator()).rdd(), Encoders.bean(Relation.class));
-        mergeRelation.write().mode("overwrite").save(targetPath+"/"+entity+"_mergeRels");
+        mergeRelation.write().mode("overwrite").save(DedupUtility.createMergeRelPath(targetPath,entity));
    }
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java
@ -1,71 +1,39 @@
 package eu.dnetlib.dedup;
-import com.google.common.collect.ComparisonChain;
+import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.collect.Lists;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.util.MapDocumentUtil;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.Partitioner;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.Optional;
 import org.apache.spark.api.java.function.Function;
 import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
 import org.apache.spark.sql.types.StructType;
 import scala.Tuple2;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Comparator;
 import java.util.List;
 public class SparkCreateDedupRecord {
    public static void main(String[] args) throws Exception {
-//        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json")));
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json")));
-//        parser.parseArgument(args);
+        parser.parseArgument(args);
-//        final SparkSession spark = SparkSession
+        final SparkSession spark = SparkSession
-//                .builder()
+                .builder()
-//                .appName(SparkCreateDedupRecord.class.getSimpleName())
+                .appName(SparkCreateDedupRecord.class.getSimpleName())
-//                .master(parser.get("master"))
+                .master(parser.get("master"))
-//                .getOrCreate();
+                .getOrCreate();
 //
 //        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
 //        final String inputPath = parser.get("sourcePath");
 //        final String entity = parser.get("entity");
 //        final String targetPath = parser.get("targetPath");
 ////        final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
 //        final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
 //
 //        //<id, json_entity>
 //        final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(inputPath + "/" + entity)
 //                .mapToPair((PairFunction<String,String,String>)it->
 //                    new Tuple2<String,String>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it),it)
 //                );
-//        //<source, target>: source is the dedup_id, target is the id of the mergedIn
+        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
-//        JavaPairRDD<String,String> mergeRels = spark
+        final String sourcePath = parser.get("sourcePath");
-//                .read().load(targetPath + "/" + entity+"_mergeRels").as(Encoders.bean(Relation.class))
+        final String entity = parser.get("entity");
-//                .where("relClass=='merges'")
+        final String dedupPath = parser.get("dedupPath");
-//                .javaRDD()
+//        final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
-//                .mapToPair(
+        final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
-//                        (PairFunction<Relation, String,String>)r->
+
-//                                new Tuple2<String,String>(r.getTarget(), r.getSource())
+        final JavaRDD<OafEntity> dedupRecord = DedupRecordFactory.createDedupRecord(sc, spark, DedupUtility.createMergeRelPath(dedupPath,entity), DedupUtility.createEntityPath(sourcePath,entity), OafEntityType.valueOf(entity), dedupConf);
-//                );
+        dedupRecord.map(r-> {
-//
+            ObjectMapper mapper = new ObjectMapper();
-//        //<dedup_id, json_entity_merged>
+            return mapper.writeValueAsString(r);
-//        final JavaPairRDD<String, String> p = mergeRels.join(inputJsonEntities).mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
+        }).saveAsTextFile(dedupPath+"/"+entity+"_dedup_record_json");
        StructType schema = Encoders.bean(Publication.class).schema();
        System.out.println(schema);
    }
 }
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java
@ -77,7 +77,7 @@ public class SparkCreateSimRels {
            return r;
        });
-        spark.createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save(targetPath+"/"+entity+"_simrel");
+        spark.createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(targetPath,entity));
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json
@ -0,0 +1,33 @@
 [
  {
    "paramName": "mt",
    "paramLongName": "master",
    "paramDescription": "should be local or yarn",
    "paramRequired": true
  },
  {
    "paramName": "s",
    "paramLongName": "sourcePath",
    "paramDescription": "the path of the sequential file to read",
    "paramRequired": true
  },
  {
    "paramName": "e",
    "paramLongName": "entity",
    "paramDescription": "the type of entity to be deduped",
    "paramRequired": true
  },
  {
    "paramName": "c",
    "paramLongName": "dedupConf",
    "paramDescription": "dedup configuration to be used",
    "compressed": true,
    "paramRequired": true
  },
  {
    "paramName": "d",
    "paramLongName": "dedupPath",
    "paramDescription": "dedup path to load mergeRelation",
    "paramRequired": true
  }
 ]
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json
@ -1,7 +1,33 @@
 [
-  {"paramName":"mt",  "paramLongName":"master",      "paramDescription": "should be local or yarn",                  "paramRequired": true},
+  {
-  {"paramName":"s",   "paramLongName":"sourcePath",  "paramDescription": "the path of the sequential file to read",  "paramRequired": true},
+    "paramName": "mt",
-  {"paramName":"e",   "paramLongName":"entity",      "paramDescription": "the type of entity to be deduped",         "paramRequired": true},
+    "paramLongName": "master",
-  {"paramName":"c",   "paramLongName":"dedupConf",   "paramDescription": "dedup configuration to be used",           "paramRequired": true},
+    "paramDescription": "should be local or yarn",
-  {"paramName":"t",   "paramLongName":"targetPath",  "paramDescription": "target path to save dedup result",         "paramRequired": true}
+    "paramRequired": true
  },
  {
    "paramName": "s",
    "paramLongName": "sourcePath",
    "paramDescription": "the path of the sequential file to read",
    "paramRequired": true
  },
  {
    "paramName": "e",
    "paramLongName": "entity",
    "paramDescription": "the type of entity to be deduped",
    "paramRequired": true
  },
  {
    "paramName": "c",
    "paramLongName": "dedupConf",
    "paramDescription": "dedup configuration to be used",
    "compressed": true,
    "paramRequired": true
  },
  {
    "paramName": "t",
    "paramLongName": "targetPath",
    "paramDescription": "target path to save dedup result",
    "paramRequired": true
  }
 ]
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml
@ -93,6 +93,31 @@
            <arg>--entity</arg><arg>${entity}</arg>
            <arg>--dedupConf</arg><arg>${dedupConf}</arg>
        </spark>
        <ok to="CreateDedupRecord"/>
        <error to="Kill"/>
    </action>
    <action name="CreateDedupRecord">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Create Connected Components</name>
            <class>eu.dnetlib.dedup.SparkCreateDedupRecord</class>
            <jar>dhp-dedup-${projectVersion}.jar</jar>
            <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory} --conf
                spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
                spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
                spark.sql.warehouse.dir="/user/hive/warehouse"
            </spark-opts>
            <arg>-mt</arg><arg>yarn-cluster</arg>
            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
            <arg>--dedupPath</arg><arg>${dedupPath}</arg>
            <arg>--entity</arg><arg>${entity}</arg>
            <arg>--dedupConf</arg><arg>${dedupConf}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
--- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java
+++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java
@ -0,0 +1,61 @@
 package eu.dnetlib.dedup;
 import eu.dnetlib.dhp.schema.oaf.Publication;
 import org.apache.commons.io.IOUtils;
 import org.codehaus.jackson.map.ObjectMapper;
 import org.junit.Before;
 import org.junit.Test;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.List;
 import java.util.stream.Collectors;
 public class MergeAuthorTest {
    List<Publication> publicationsToMerge;
    final ObjectMapper mapper = new ObjectMapper();
    @Before
    public void setUp() throws Exception {
        final String json = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dedup/json/authors_merge.json"));
        publicationsToMerge = Arrays.asList(json.split("\n")).stream().map(s-> {
            try {
                return mapper.readValue(s, Publication.class);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }).collect(Collectors.toList());
    }
    @Test
    public void test() throws  Exception {
        Publication dedup = new Publication();
        publicationsToMerge.forEach(p-> {
            dedup.mergeFrom(p);
            dedup.setAuthor(DedupUtility.mergeAuthor(dedup.getAuthor(),p.getAuthor()));
        });
        System.out.println(mapper.writeValueAsString(dedup));
    }
 }
--- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java
+++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java
@ -1,6 +1,7 @@
 package eu.dnetlib.dedup;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.schema.oaf.Publication;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
@ -27,9 +28,9 @@ public class SparkCreateDedupTest {
    public void createSimRelsTest() throws Exception {
        SparkCreateSimRels.main(new String[] {
                "-mt", "local[*]",
-                "-s", "/Users/miconis/dumps",
+                "-s", "/home/sandro/betadump",
                "-e", "publication",
-                "-c", configuration,
+                "-c", ArgumentApplicationParser.compressArgument(configuration),
                "-t", "/tmp/dedup",
        });
    }
@ -40,9 +41,9 @@ public class SparkCreateDedupTest {
        SparkCreateConnectedComponent.main(new String[] {
                "-mt", "local[*]",
-                "-s", "/Users/miconis/dumps",
+                "-s", "/home/sandro/betadump",
                "-e", "publication",
-                "-c", configuration,
+                "-c", ArgumentApplicationParser.compressArgument(configuration),
                "-t", "/tmp/dedup",
        });
    }
@ -52,10 +53,18 @@ public class SparkCreateDedupTest {
    public void dedupRecordTest() throws Exception {
        SparkCreateDedupRecord.main(new String[] {
                "-mt", "local[*]",
-                "-s", "/Users/miconis/dumps",
+                "-s", "/home/sandro/betadump",
                "-e", "publication",
-                "-c", configuration,
+                "-c", ArgumentApplicationParser.compressArgument(configuration),
-                "-t", "/tmp/dedup",
+                "-d", "/tmp/dedup",
        });
    }
    @Test
    public void printCC() throws Exception {
        System.out.println(ArgumentApplicationParser.compressArgument(configuration));
    }
 }
--- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json
+++ b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json
--- a/dhp-workflows/dhp-distcp/src/main/resources/eu/dnetlib/dhp/distcp/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-distcp/src/main/resources/eu/dnetlib/dhp/distcp/oozie_app/config-default.xml
@ -1,18 +1,18 @@
 <configuration>
    <property>
        <name>jobTracker</name>
-        <dedupId>yarnRM</dedupId>
+        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
-        <dedupId>hdfs://nameservice1</dedupId>
+        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>sourceNN</name>
-        <dedupId>webhdfs://namenode2.hadoop.dm.openaire.eu:50071</dedupId>
+        <value>webhdfs://namenode2.hadoop.dm.openaire.eu:50071</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
-        <dedupId>true</dedupId>
+        <value>true</value>
    </property>
 </configuration>
--- a/dhp-workflows/dhp-distcp/src/main/resources/eu/dnetlib/dhp/distcp/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-distcp/src/main/resources/eu/dnetlib/dhp/distcp/oozie_app/workflow.xml
@ -14,12 +14,12 @@
        </property>
        <property>
            <name>hbase_dump_distcp_memory_mb</name>
-            <dedupId>6144</dedupId>
+            <value>6144</value>
            <description>memory for distcp action copying InfoSpace dump from remote cluster</description>
        </property>
        <property>
            <name>hbase_dump_distcp_num_maps</name>
-            <dedupId>1</dedupId>
+            <value>1</value>
            <description>maximum number of simultaneous copies of InfoSpace dump from remote location</description>
        </property>
    </parameters>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml
@ -1,26 +1,26 @@
 <configuration>
    <property>
        <name>jobTracker</name>
-        <dedupId>yarnRM</dedupId>
+        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
-        <dedupId>hdfs://nameservice1</dedupId>
+        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
-        <dedupId>true</dedupId>
+        <value>true</value>
    </property>
    <property>
        <name>oozie.action.sharelib.for.spark</name>
-        <dedupId>spark2</dedupId>
+        <value>spark2</value>
    </property>
    <property>
        <name>hive_metastore_uris</name>
-        <dedupId>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</dedupId>
+        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
        <name>hive_db_name</name>
-        <dedupId>openaire</dedupId>
+        <value>openaire</value>
    </property>
 </configuration>
--- a/dhp-workflows/docs/oozie-installer.markdown
+++ b/dhp-workflows/docs/oozie-installer.markdown
@ -54,7 +54,7 @@ Properties overriding order is the following:
 2. `~/.dhp/application.properties` defined properties
 3. `${workflow.source.dir}/job.properties`
 4. `job-override.properties` (located in the project root dir)
-5. `maven -Dparam=dedupId`
+5. `maven -Dparam=value`
 where the maven `-Dparam` property is overriding all the other ones.
@ -73,7 +73,7 @@ Workflow definition requirements
 This property can be set using maven `-D` switch.
-`[oozie_app]` is the default directory name however it can be set to any dedupId as soon as `oozieAppDir` property is provided with directory name as dedupId.
+`[oozie_app]` is the default directory name however it can be set to any value as soon as `oozieAppDir` property is provided with directory name as value.
 Subworkflows are supported as well and subworkflow directories should be nested within `[oozie_app]` directory. 
--- a/dhp-workflows/pom.xml
+++ b/dhp-workflows/pom.xml
@ -73,7 +73,7 @@
            <!-- This profile sets properties that are required for test oozie workflows To be used only with 'oozie-package' profile -->
            <id>attach-test-resources</id>
            <properties>
-                <!--overriding default scope (set to 'runtime') with the 'test' dedupId. Test resources attached to oozie package requires all test dependencies. -->
+                <!--overriding default scope (set to 'runtime') with the 'test' value. Test resources attached to oozie package requires all test dependencies. -->
                <oozie.package.dependencies.include.scope />
                <oozie.package.dependencies.exclude.scope>provided</oozie.package.dependencies.exclude.scope>
                <!-- Do not skip creation of test jar for priming (in oozie-package profile) -->
@ -326,7 +326,7 @@
                                </goals>
                                <configuration>
                                    <tasks>
-                                        <property name="assembly-resources.loc" dedupId="${maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path}" />
+                                        <property name="assembly-resources.loc" value="${maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path}" />
                                        <unjar src="${assembly-resources.loc}" dest="${project.build.directory}/assembly-resources" />
                                    </tasks>
                                </configuration>
--- a/pom.xml
+++ b/pom.xml
@ -236,6 +236,11 @@
                <artifactId>java-jq</artifactId>
                <version>0.10.1</version>
            </dependency>
            <dependency>
                <groupId>edu.cmu</groupId>
                <artifactId>secondstring</artifactId>
                <version>1.0.0</version>
            </dependency>
            <dependency>
                <groupId>org.apache.oozie</groupId>