Merge branch 'master' of code-repo.d4science.org:D-Net/dnet-hadoop

2020-03-27 11:47:07 +01:00 · 2020-03-27 11:47:07 +01:00 · ae03948eed
parent f6e86b44a6 673e744649
commit ae03948eed
161 changed files with 6471 additions and 311 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,9 +1,12 @@
 .DS_Store
 .idea
+*.iws
+*.ipr
 *.iml
 *.ipr
 *.iws
 *~
+.vscode
 .classpath
 /*/.classpath
 /*/*/.classpath
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@ -58,6 +58,15 @@
 			<groupId>eu.dnetlib</groupId>
 			<artifactId>cnr-rmi-api</artifactId>
 		</dependency>
+
+		<dependency>
+			<groupId>com.ximpleware</groupId>
+			<artifactId>vtd-xml</artifactId>
+		</dependency>
+		<dependency>
+			<groupId>com.jayway.jsonpath</groupId>
+			<artifactId>json-path</artifactId>
+		</dependency>
 	</dependencies>

 </project>
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdException.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdException.java
@ -0,0 +1,12 @@
+package eu.dnetlib.dhp.parser.utility;
+
+public class VtdException extends Exception {
+
+    public VtdException(final Exception e) {
+        super(e);
+    }
+
+    public VtdException(final Throwable e) {
+        super(e);
+    }
+}
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdUtilityParser.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdUtilityParser.java
@ -0,0 +1,107 @@
+package eu.dnetlib.dhp.parser.utility;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+
+import com.ximpleware.AutoPilot;
+import com.ximpleware.VTDNav;
+
+/**
+ * Created by sandro on 9/29/16.
+ */
+public class VtdUtilityParser {
+
+    public static List<Node> getTextValuesWithAttributes(final AutoPilot ap, final VTDNav vn, final String xpath, final List<String> attributes)
+            throws VtdException {
+        final List<Node> results = new ArrayList<>();
+        try {
+            ap.selectXPath(xpath);
+
+            while (ap.evalXPath() != -1) {
+                final Node currentNode = new Node();
+                int t = vn.getText();
+                if (t >= 0) {
+                    currentNode.setTextValue(vn.toNormalizedString(t));
+                }
+                currentNode.setAttributes(getAttributes(vn, attributes));
+                results.add(currentNode);
+            }
+            return results;
+        } catch (Exception e) {
+            throw new VtdException(e);
+        }
+    }
+
+    private static Map<String, String> getAttributes(final VTDNav vn, final List<String> attributes) {
+        final Map<String, String> currentAttributes = new HashMap<>();
+        if (attributes != null) {
+
+            attributes.forEach(attributeKey -> {
+                try {
+                    int attr = vn.getAttrVal(attributeKey);
+                    if (attr > -1) {
+                        currentAttributes.put(attributeKey, vn.toNormalizedString(attr));
+                    }
+                } catch (Throwable e) {
+                    throw new RuntimeException(e);
+                }
+            });
+        }
+        return currentAttributes;
+    }
+
+    public static List<String> getTextValue(final AutoPilot ap, final VTDNav vn, final String xpath) throws VtdException {
+        List<String> results = new ArrayList<>();
+        try {
+            ap.selectXPath(xpath);
+            while (ap.evalXPath() != -1) {
+                int t = vn.getText();
+                if (t > -1) results.add(vn.toNormalizedString(t));
+            }
+            return results;
+        } catch (Exception e) {
+            throw new VtdException(e);
+        }
+    }
+
+    public static String getSingleValue(final AutoPilot ap, final VTDNav nav, final String xpath) throws VtdException {
+        try {
+            ap.selectXPath(xpath);
+            while (ap.evalXPath() != -1) {
+                int it = nav.getText();
+                if (it > -1)
+                    return nav.toNormalizedString(it);
+            }
+            return null;
+        } catch (Exception e) {
+            throw new VtdException(e);
+        }
+    }
+
+    public static class Node {
+
+        private String textValue;
+
+        private Map<String, String> attributes;
+
+        public String getTextValue() {
+            return textValue;
+        }
+
+        public void setTextValue(final String textValue) {
+            this.textValue = textValue;
+        }
+
+        public Map<String, String> getAttributes() {
+            return attributes;
+        }
+
+        public void setAttributes(final Map<String, String> attributes) {
+            this.attributes = attributes;
+        }
+    }
+
+}
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java
@ -1,5 +1,7 @@
 package eu.dnetlib.dhp.utils;

+import com.jayway.jsonpath.JsonPath;
+import net.minidev.json.JSONArray;
 import org.apache.commons.codec.binary.Base64;
 import org.apache.commons.codec.binary.Base64OutputStream;
 import org.apache.commons.codec.binary.Hex;
@ -56,4 +58,17 @@ public class DHPUtils {

    }

+    public static String getJPathString(final String jsonPath, final String json) {
+        try {
+            Object o = JsonPath.read(json, jsonPath);
+            if (o instanceof String)
+                return (String) o;
+            if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
+                return (String) ((JSONArray) o).get(0);
+            return o.toString();
+        } catch (Exception e) {
+            return "";
+        }
+    }
+
 }
--- a/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelInfo.java
+++ b/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelInfo.java
@ -0,0 +1,24 @@
+package eu.dnetlib.scholexplorer.relation;
+
+import java.io.Serializable;
+
+public class RelInfo  implements Serializable {
+    private String original;
+    private String inverse;
+
+    public String getOriginal() {
+        return original;
+    }
+
+    public void setOriginal(String original) {
+        this.original = original;
+    }
+
+    public String getInverse() {
+        return inverse;
+    }
+
+    public void setInverse(String inverse) {
+        this.inverse = inverse;
+    }
+}
--- a/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelationMapper.java
+++ b/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelationMapper.java
@ -0,0 +1,19 @@
+package eu.dnetlib.scholexplorer.relation;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.apache.commons.io.IOUtils;
+
+import java.io.Serializable;
+import java.util.HashMap;
+
+public class RelationMapper extends HashMap<String,RelInfo > implements Serializable {
+
+    public static RelationMapper load() throws Exception {
+
+        final String json = IOUtils.toString(RelationMapper.class.getResourceAsStream("relations.json"));
+
+        ObjectMapper mapper = new ObjectMapper();
+        return mapper.readValue(json, RelationMapper.class);
+    }
+
+}
--- a/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json
+++ b/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json
@ -0,0 +1,158 @@
+{
+  "cites":{
+    "original":"Cites",
+    "inverse":"IsCitedBy"
+  },
+  "compiles":{
+    "original":"Compiles",
+    "inverse":"IsCompiledBy"
+  },
+  "continues":{
+    "original":"Continues",
+    "inverse":"IsContinuedBy"
+  },
+  "derives":{
+    "original":"IsSourceOf",
+    "inverse":"IsDerivedFrom"
+  },
+  "describes":{
+    "original":"Describes",
+    "inverse":"IsDescribedBy"
+  },
+  "documents":{
+    "original":"Documents",
+    "inverse":"IsDocumentedBy"
+  },
+  "hasmetadata":{
+    "original":"HasMetadata",
+    "inverse":"IsMetadataOf"
+  },
+  "hasassociationwith":{
+    "original":"HasAssociationWith",
+    "inverse":"HasAssociationWith"
+  },
+  "haspart":{
+    "original":"HasPart",
+    "inverse":"IsPartOf"
+  },
+  "hasversion":{
+    "original":"HasVersion",
+    "inverse":"IsVersionOf"
+  },
+  "iscitedby":{
+    "original":"IsCitedBy",
+    "inverse":"Cites"
+  },
+  "iscompiledby":{
+    "original":"IsCompiledBy",
+    "inverse":"Compiles"
+  },
+  "iscontinuedby":{
+    "original":"IsContinuedBy",
+    "inverse":"Continues"
+  },
+  "isderivedfrom":{
+    "original":"IsDerivedFrom",
+    "inverse":"IsSourceOf"
+  },
+  "isdescribedby":{
+    "original":"IsDescribedBy",
+    "inverse":"Describes"
+  },
+  "isdocumentedby":{
+    "original":"IsDocumentedBy",
+    "inverse":"Documents"
+  },
+  "isidenticalto":{
+    "original":"IsIdenticalTo",
+    "inverse":"IsIdenticalTo"
+  },
+  "ismetadatafor":{
+    "original":"IsMetadataFor",
+    "inverse":"IsMetadataOf"
+  },
+  "ismetadataof":{
+    "original":"IsMetadataOf",
+    "inverse":"IsMetadataFor"
+  },
+  "isnewversionof":{
+    "original":"IsNewVersionOf",
+    "inverse":"IsPreviousVersionOf"
+  },
+  "isobsoletedby":{
+    "original":"IsObsoletedBy",
+    "inverse":"Obsoletes"
+  },
+  "isoriginalformof":{
+    "original":"IsOriginalFormOf",
+    "inverse":"IsVariantFormOf"
+  },
+  "ispartof":{
+    "original":"IsPartOf",
+    "inverse":"HasPart"
+  },
+  "ispreviousversionof":{
+    "original":"IsPreviousVersionOf",
+    "inverse":"IsNewVersionOf"
+  },
+  "isreferencedby":{
+    "original":"IsReferencedBy",
+    "inverse":"References"
+  },
+  "isrelatedto":{
+    "original":"IsRelatedTo",
+    "inverse":"IsRelatedTo"
+  },
+  "isrequiredby":{
+    "original":"IsRequiredBy",
+    "inverse":"Requires"
+  },
+  "isreviewedby":{
+    "original":"IsReviewedBy",
+    "inverse":"Reviews"
+  },
+  "issourceof":{
+    "original":"IsSourceOf",
+    "inverse":"IsDerivedFrom"
+  },
+  "issupplementedby":{
+    "original":"IsSupplementedBy",
+    "inverse":"IsSupplementTo"
+  },
+  "issupplementto":{
+    "original":"IsSupplementTo",
+    "inverse":"IsSupplementedBy"
+  },
+  "isvariantformof":{
+    "original":"IsVariantFormOf",
+    "inverse":"IsOriginalFormOf"
+  },
+  "isversionof":{
+    "original":"IsVersionOf",
+    "inverse":"HasVersion"
+  },
+  "obsoletes":{
+    "original":"Obsoletes",
+    "inverse":"IsObsoletedBy"
+  },
+  "references":{
+    "original":"References",
+    "inverse":"IsReferencedBy"
+  },
+  "requires":{
+    "original":"Requires",
+    "inverse":"IsRequiredBy"
+  },
+  "related":{
+    "original":"IsRelatedTo",
+    "inverse":"IsRelatedTo"
+  },
+  "reviews":{
+    "original":"Reviews",
+    "inverse":"IsReviewedBy"
+  },
+  "unknown":{
+    "original":"Unknown",
+    "inverse":"Unknown"
+  }
+}
--- a/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java
@ -0,0 +1,15 @@
+package eu.dnetlib.scholexplorer.relation;
+
+import org.junit.jupiter.api.Test;
+
+
+public class RelationMapperTest {
+
+    @Test
+    public void testLoadRels() throws Exception{
+
+        RelationMapper relationMapper = RelationMapper.load();
+        relationMapper.keySet().forEach(System.out::println);
+
+    }
+}
--- a/dhp-common/src/test/resources/eu/dnetlib/scholexplorer/relation/relations.json
+++ b/dhp-common/src/test/resources/eu/dnetlib/scholexplorer/relation/relations.json
@ -0,0 +1,158 @@
+{
+  "cites":{
+    "original":"Cites",
+    "inverse":"IsCitedBy"
+  },
+  "compiles":{
+    "original":"Compiles",
+    "inverse":"IsCompiledBy"
+  },
+  "continues":{
+    "original":"Continues",
+    "inverse":"IsContinuedBy"
+  },
+  "derives":{
+    "original":"IsSourceOf",
+    "inverse":"IsDerivedFrom"
+  },
+  "describes":{
+    "original":"Describes",
+    "inverse":"IsDescribedBy"
+  },
+  "documents":{
+    "original":"Documents",
+    "inverse":"IsDocumentedBy"
+  },
+  "hasmetadata":{
+    "original":"HasMetadata",
+    "inverse":"IsMetadataOf"
+  },
+  "hasassociationwith":{
+    "original":"HasAssociationWith",
+    "inverse":"HasAssociationWith"
+  },
+  "haspart":{
+    "original":"HasPart",
+    "inverse":"IsPartOf"
+  },
+  "hasversion":{
+    "original":"HasVersion",
+    "inverse":"IsVersionOf"
+  },
+  "iscitedby":{
+    "original":"IsCitedBy",
+    "inverse":"Cites"
+  },
+  "iscompiledby":{
+    "original":"IsCompiledBy",
+    "inverse":"Compiles"
+  },
+  "iscontinuedby":{
+    "original":"IsContinuedBy",
+    "inverse":"Continues"
+  },
+  "isderivedfrom":{
+    "original":"IsDerivedFrom",
+    "inverse":"IsSourceOf"
+  },
+  "isdescribedby":{
+    "original":"IsDescribedBy",
+    "inverse":"Describes"
+  },
+  "isdocumentedby":{
+    "original":"IsDocumentedBy",
+    "inverse":"Documents"
+  },
+  "isidenticalto":{
+    "original":"IsIdenticalTo",
+    "inverse":"IsIdenticalTo"
+  },
+  "ismetadatafor":{
+    "original":"IsMetadataFor",
+    "inverse":"IsMetadataOf"
+  },
+  "ismetadataof":{
+    "original":"IsMetadataOf",
+    "inverse":"IsMetadataFor"
+  },
+  "isnewversionof":{
+    "original":"IsNewVersionOf",
+    "inverse":"IsPreviousVersionOf"
+  },
+  "isobsoletedby":{
+    "original":"IsObsoletedBy",
+    "inverse":"Obsoletes"
+  },
+  "isoriginalformof":{
+    "original":"IsOriginalFormOf",
+    "inverse":"IsVariantFormOf"
+  },
+  "ispartof":{
+    "original":"IsPartOf",
+    "inverse":"HasPart"
+  },
+  "ispreviousversionof":{
+    "original":"IsPreviousVersionOf",
+    "inverse":"IsNewVersionOf"
+  },
+  "isreferencedby":{
+    "original":"IsReferencedBy",
+    "inverse":"References"
+  },
+  "isrelatedto":{
+    "original":"IsRelatedTo",
+    "inverse":"IsRelatedTo"
+  },
+  "isrequiredby":{
+    "original":"IsRequiredBy",
+    "inverse":"Requires"
+  },
+  "isreviewedby":{
+    "original":"IsReviewedBy",
+    "inverse":"Reviews"
+  },
+  "issourceof":{
+    "original":"IsSourceOf",
+    "inverse":"IsDerivedFrom"
+  },
+  "issupplementedby":{
+    "original":"IsSupplementedBy",
+    "inverse":"IsSupplementTo"
+  },
+  "issupplementto":{
+    "original":"IsSupplementTo",
+    "inverse":"IsSupplementedBy"
+  },
+  "isvariantformof":{
+    "original":"IsVariantFormOf",
+    "inverse":"IsOriginalFormOf"
+  },
+  "isversionof":{
+    "original":"IsVersionOf",
+    "inverse":"HasVersion"
+  },
+  "obsoletes":{
+    "original":"Obsoletes",
+    "inverse":"IsObsoletedBy"
+  },
+  "references":{
+    "original":"References",
+    "inverse":"IsReferencedBy"
+  },
+  "requires":{
+    "original":"Requires",
+    "inverse":"IsRequiredBy"
+  },
+  "related":{
+    "original":"IsRelatedTo",
+    "inverse":"IsRelatedTo"
+  },
+  "reviews":{
+    "original":"Reviews",
+    "inverse":"IsReviewedBy"
+  },
+  "unknown":{
+    "original":"Unknown",
+    "inverse":"Unknown"
+  }
+}
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIDataset.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIDataset.java
@ -0,0 +1,80 @@
+package eu.dnetlib.dhp.schema.scholexplorer;
+
+import eu.dnetlib.dhp.schema.oaf.Dataset;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+public class DLIDataset extends Dataset {
+
+    private String originalObjIdentifier;
+
+    private List<ProvenaceInfo> dlicollectedfrom;
+
+    private String completionStatus;
+
+    public String getCompletionStatus() {
+        return completionStatus;
+    }
+
+    public void setCompletionStatus(String completionStatus) {
+        this.completionStatus = completionStatus;
+    }
+
+    public List<ProvenaceInfo> getDlicollectedfrom() {
+        return dlicollectedfrom;
+    }
+
+    public void setDlicollectedfrom(List<ProvenaceInfo> dlicollectedfrom) {
+        this.dlicollectedfrom = dlicollectedfrom;
+    }
+
+    public String getOriginalObjIdentifier() {
+        return originalObjIdentifier;
+    }
+
+    public void setOriginalObjIdentifier(String originalObjIdentifier) {
+        this.originalObjIdentifier = originalObjIdentifier;
+    }
+
+    @Override
+    public void mergeFrom(OafEntity e) {
+        super.mergeFrom(e);
+        DLIDataset p = (DLIDataset) e;
+        if (StringUtils.isBlank(completionStatus) && StringUtils.isNotBlank(p.completionStatus))
+            completionStatus = p.completionStatus;
+        if ("complete".equalsIgnoreCase(p.completionStatus))
+            completionStatus = "complete";
+        dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom());
+    }
+
+    private List<ProvenaceInfo> mergeProvenance(final List<ProvenaceInfo> a, final List<ProvenaceInfo> b) {
+        Map<String, ProvenaceInfo> result = new HashMap<>();
+        if (a != null)
+            a.forEach(p -> {
+                if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
+                    if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
+                        result.put(p.getId(), p);
+                    }
+
+                } else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
+                    result.put(p.getId(), p);
+            });
+        if (b != null)
+            b.forEach(p -> {
+                if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
+                    if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
+                        result.put(p.getId(), p);
+                    }
+
+                } else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
+                    result.put(p.getId(), p);
+            });
+
+        return new ArrayList<>(result.values());
+    }
+}
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIPublication.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIPublication.java
@ -0,0 +1,77 @@
+package eu.dnetlib.dhp.schema.scholexplorer;
+
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import org.apache.commons.lang3.StringUtils;
+import java.io.Serializable;
+import java.util.*;
+
+public class DLIPublication extends Publication implements Serializable {
+
+    private String originalObjIdentifier;
+
+    private List<ProvenaceInfo> dlicollectedfrom;
+
+    private String completionStatus;
+
+    public String getCompletionStatus() {
+        return completionStatus;
+    }
+
+    public void setCompletionStatus(String completionStatus) {
+        this.completionStatus = completionStatus;
+    }
+
+    public List<ProvenaceInfo> getDlicollectedfrom() {
+        return dlicollectedfrom;
+    }
+
+    public void setDlicollectedfrom(List<ProvenaceInfo> dlicollectedfrom) {
+        this.dlicollectedfrom = dlicollectedfrom;
+    }
+
+    public String getOriginalObjIdentifier() {
+        return originalObjIdentifier;
+    }
+
+    public void setOriginalObjIdentifier(String originalObjIdentifier) {
+        this.originalObjIdentifier = originalObjIdentifier;
+    }
+
+    @Override
+    public void mergeFrom(OafEntity e) {
+        super.mergeFrom(e);
+        DLIPublication p = (DLIPublication) e;
+        if (StringUtils.isBlank(completionStatus) && StringUtils.isNotBlank(p.completionStatus))
+            completionStatus = p.completionStatus;
+        if ("complete".equalsIgnoreCase(p.completionStatus))
+            completionStatus = "complete";
+        dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom());
+    }
+
+    private List<ProvenaceInfo> mergeProvenance(final List<ProvenaceInfo> a, final List<ProvenaceInfo> b) {
+        Map<String, ProvenaceInfo> result = new HashMap<>();
+        if (a != null)
+            a.forEach(p -> {
+                if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
+                    if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
+                        result.put(p.getId(), p);
+                    }
+
+                } else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
+                    result.put(p.getId(), p);
+            });
+        if (b != null)
+            b.forEach(p -> {
+                if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
+                    if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
+                        result.put(p.getId(), p);
+                    }
+
+                } else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
+                    result.put(p.getId(), p);
+            });
+
+        return new ArrayList<>(result.values());
+    }
+}
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIUnknown.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIUnknown.java
@ -0,0 +1,108 @@
+package eu.dnetlib.dhp.schema.scholexplorer;
+
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import org.apache.commons.lang3.StringUtils;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+public class DLIUnknown extends Oaf implements Serializable {
+
+    private String id;
+
+    private List<StructuredProperty> pid;
+
+    private String dateofcollection;
+
+    private String dateoftransformation;
+
+    private List<ProvenaceInfo> dlicollectedfrom;
+
+    private String completionStatus = "incomplete";
+
+    public String getCompletionStatus() {
+        return completionStatus;
+    }
+
+    public void setCompletionStatus(String completionStatus) {
+        this.completionStatus = completionStatus;
+    }
+
+    public List<ProvenaceInfo> getDlicollectedfrom() {
+        return dlicollectedfrom;
+    }
+
+    public void setDlicollectedfrom(List<ProvenaceInfo> dlicollectedfrom) {
+        this.dlicollectedfrom = dlicollectedfrom;
+    }
+
+    public String getId() {
+        return id;
+    }
+
+    public void setId(String id) {
+        this.id = id;
+    }
+
+
+    public List<StructuredProperty> getPid() {
+        return pid;
+    }
+
+    public void setPid(List<StructuredProperty> pid) {
+        this.pid = pid;
+    }
+
+    public String getDateofcollection() {
+        return dateofcollection;
+    }
+
+    public void setDateofcollection(String dateofcollection) {
+        this.dateofcollection = dateofcollection;
+    }
+
+    public String getDateoftransformation() {
+        return dateoftransformation;
+    }
+
+    public void setDateoftransformation(String dateoftransformation) {
+        this.dateoftransformation = dateoftransformation;
+    }
+
+    public void mergeFrom(DLIUnknown p) {
+        if ("complete".equalsIgnoreCase(p.completionStatus))
+            completionStatus = "complete";
+        dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom());
+    }
+
+    private List<ProvenaceInfo> mergeProvenance(final List<ProvenaceInfo> a, final List<ProvenaceInfo> b) {
+        Map<String, ProvenaceInfo> result = new HashMap<>();
+        if (a != null)
+            a.forEach(p -> {
+                if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
+                    if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
+                        result.put(p.getId(), p);
+                    }
+
+                } else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
+                    result.put(p.getId(), p);
+            });
+        if (b != null)
+            b.forEach(p -> {
+                if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
+                    if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
+                        result.put(p.getId(), p);
+                    }
+
+                } else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
+                    result.put(p.getId(), p);
+            });
+
+        return new ArrayList<>(result.values());
+    }
+}
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/ProvenaceInfo.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/ProvenaceInfo.java
@ -0,0 +1,46 @@
+package eu.dnetlib.dhp.schema.scholexplorer;
+
+import java.io.Serializable;
+
+public class ProvenaceInfo implements Serializable {
+
+    private String id;
+
+    private String name;
+
+    private String completionStatus;
+
+    private String collectionMode ="collected";
+
+    public String getId() {
+        return id;
+    }
+
+    public void setId(String id) {
+        this.id = id;
+    }
+
+    public String getName() {
+        return name;
+    }
+
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    public String getCompletionStatus() {
+        return completionStatus;
+    }
+
+    public void setCompletionStatus(String completionStatus) {
+        this.completionStatus = completionStatus;
+    }
+
+    public String getCollectionMode() {
+        return collectionMode;
+    }
+
+    public void setCollectionMode(String collectionMode) {
+        this.collectionMode = collectionMode;
+    }
+}
--- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java
+++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java
@ -0,0 +1,81 @@
+package eu.dnetlib.dhp.schema.scholexplorer;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.SerializationFeature;
+import eu.dnetlib.dhp.schema.oaf.Qualifier;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+
+public class DLItest {
+
+
+    @Test
+    public void testMergePublication() throws JsonProcessingException {
+        DLIPublication a1 = new DLIPublication();
+        a1.setPid(Arrays.asList( createSP("123456","pdb","dnet:pid_types")));
+        a1.setTitle(Collections.singletonList(createSP("Un Titolo", "title", "dnetTitle")));
+        a1.setDlicollectedfrom(Arrays.asList(createCollectedFrom("znd","Zenodo","complete")));
+        a1.setCompletionStatus("complete");
+
+        DLIPublication a = new DLIPublication();
+        a.setPid(Arrays.asList(createSP("10.11","doi","dnet:pid_types"), createSP("123456","pdb","dnet:pid_types")));
+        a.setTitle(Collections.singletonList(createSP("A Title", "title", "dnetTitle")));
+        a.setDlicollectedfrom(Arrays.asList(createCollectedFrom("dct","datacite","complete"),createCollectedFrom("dct","datacite","incomplete")));
+        a.setCompletionStatus("incomplete");
+
+        a.mergeFrom(a1);
+
+        ObjectMapper mapper = new ObjectMapper();
+        System.out.println(mapper.writeValueAsString(a));
+
+
+
+
+
+
+
+    }
+
+
+
+    @Test
+    public void testDeserialization() throws IOException {
+
+        final String json ="{\"dataInfo\":{\"invisible\":false,\"inferred\":null,\"deletedbyinference\":false,\"trust\":\"0.9\",\"inferenceprovenance\":null,\"provenanceaction\":null},\"lastupdatetimestamp\":null,\"id\":\"60|bd9352547098929a394655ad1a44a479\",\"originalId\":[\"bd9352547098929a394655ad1a44a479\"],\"collectedfrom\":[{\"key\":\"dli_________::datacite\",\"value\":\"Datasets in Datacite\",\"dataInfo\":null,\"blank\":false}],\"pid\":[{\"value\":\"10.7925/DRS1.DUCHAS_5078760\",\"qualifier\":{\"classid\":\"doi\",\"classname\":\"doi\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\",\"blank\":false},\"dataInfo\":null}],\"dateofcollection\":\"2020-01-09T08:29:31.885Z\",\"dateoftransformation\":null,\"extraInfo\":null,\"oaiprovenance\":null,\"author\":[{\"fullname\":\"Cathail, S. Ó\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Donnell, Breda Mc\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Ireland. Department of Arts, Culture, and the Gaeltacht\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"University College Dublin\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"National Folklore Foundation\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Cathail, S. Ó\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Donnell, Breda Mc\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null}],\"resulttype\":null,\"language\":null,\"country\":null,\"subject\":[{\"value\":\"Recreation\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null},{\"value\":\"Entertainments and recreational activities\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null},{\"value\":\"Siamsaíocht agus caitheamh aimsire\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null}],\"title\":[{\"value\":\"Games We Play\",\"qualifier\":null,\"dataInfo\":null}],\"relevantdate\":[{\"value\":\"1938-09-28\",\"qualifier\":{\"classid\":\"date\",\"classname\":\"date\",\"schemeid\":\"dnet::date\",\"schemename\":\"dnet::date\",\"blank\":false},\"dataInfo\":null}],\"description\":[{\"value\":\"Story collected by Breda Mc Donnell, a student at Tenure school (Tinure, Co. Louth) (no informant identified).\",\"dataInfo\":null}],\"dateofacceptance\":null,\"publisher\":{\"value\":\"University College Dublin\",\"dataInfo\":null},\"embargoenddate\":null,\"source\":null,\"fulltext\":null,\"format\":null,\"contributor\":null,\"resourcetype\":null,\"coverage\":null,\"refereed\":null,\"context\":null,\"processingchargeamount\":null,\"processingchargecurrency\":null,\"externalReference\":null,\"instance\":[],\"storagedate\":null,\"device\":null,\"size\":null,\"version\":null,\"lastmetadataupdate\":null,\"metadataversionnumber\":null,\"geolocation\":null,\"dlicollectedfrom\":[{\"id\":\"dli_________::datacite\",\"name\":\"Datasets in Datacite\",\"completionStatus\":\"complete\",\"collectionMode\":\"resolved\"}],\"completionStatus\":\"complete\"}";
+
+        ObjectMapper mapper = new ObjectMapper();
+        mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
+        DLIDataset dliDataset = mapper.readValue(json, DLIDataset.class);
+        mapper.enable(SerializationFeature.INDENT_OUTPUT);
+        System.out.println(mapper.writeValueAsString(dliDataset));
+    }
+
+    private ProvenaceInfo createCollectedFrom(final String id, final String name, final String completionStatus) {
+        ProvenaceInfo p = new ProvenaceInfo();
+        p.setId(id);
+        p.setName(name);
+        p.setCompletionStatus(completionStatus);
+        return p;
+    }
+
+
+    private StructuredProperty createSP(final String value, final String className, final String schemeName) {
+        StructuredProperty p = new StructuredProperty();
+        p.setValue(value);
+        Qualifier schema = new Qualifier();
+        schema.setClassname(className);
+        schema.setClassid(className);
+        schema.setSchemename(schemeName);
+        schema.setSchemeid(schemeName);
+        p.setQualifier(schema);
+        return p;
+    }
+
+
+}
--- a/dhp-workflows/dhp-aggregation/pom.xml
+++ b/dhp-workflows/dhp-aggregation/pom.xml
@ -105,6 +105,7 @@
 			<artifactId>mongo-java-driver</artifactId>
 		</dependency>

+
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-distcp</artifactId>
--- a/dhp-workflows/dhp-dedup-openaire/pom.xml
+++ b/dhp-workflows/dhp-dedup-openaire/pom.xml
@ -6,9 +6,8 @@
        <version>1.1.6-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
-
    <artifactId>dhp-dedup-openaire</artifactId>
-    
+
    <build>
    	<plugins>
    	     <plugin>
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DatePicker.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DatePicker.java
@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.dedup;
+package eu.dnetlib.dhp.oa.dedup;

 import eu.dnetlib.dhp.schema.oaf.Field;
 import org.apache.commons.lang.StringUtils;
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java
@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.dedup;
+package eu.dnetlib.dhp.oa.dedup;

 import com.google.common.collect.Lists;
 import eu.dnetlib.dhp.schema.oaf.*;
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java
@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.dedup;
+package eu.dnetlib.dhp.oa.dedup;

 import com.google.common.collect.Sets;
 import com.wcohen.ss.JaroWinkler;
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java
@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.dedup;
+package eu.dnetlib.dhp.oa.dedup;

 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.model.MapDocument;
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/OafEntityType.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/OafEntityType.java
@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.dedup;
+package eu.dnetlib.dhp.oa.dedup;

 public enum OafEntityType {

--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateConnectedComponent.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateConnectedComponent.java
@ -1,9 +1,9 @@
-package eu.dnetlib.dhp.dedup;
+package eu.dnetlib.dhp.oa.dedup;

 import com.google.common.hash.Hashing;
-import eu.dnetlib.dhp.dedup.graph.ConnectedComponent;
-import eu.dnetlib.dhp.dedup.graph.GraphProcessor;
+import eu.dnetlib.dhp.oa.dedup.graph.ConnectedComponent;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.oa.dedup.graph.GraphProcessor;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.pace.config.DedupConfig;
@ -29,7 +29,9 @@ import java.util.List;
 public class SparkCreateConnectedComponent {

    public static void main(String[] args) throws Exception {
-        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/createCC_parameters.json")));
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+                IOUtils.toString(
+                        SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json")));
        parser.parseArgument(args);

        new SparkCreateConnectedComponent().run(parser);
@ -94,7 +96,6 @@ public class SparkCreateConnectedComponent {
                .appName(SparkCreateSimRels.class.getSimpleName())
                .master(parser.get("master"))
                .config(conf)
-                .enableHiveSupport()
                .getOrCreate();
    }
 }
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java
@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.dedup;
+package eu.dnetlib.dhp.oa.dedup;

 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
@ -15,7 +15,9 @@ import org.dom4j.DocumentException;
 public class SparkCreateDedupRecord {

    public static void main(String[] args) throws Exception {
-        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/createDedupRecord_parameters.json")));
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+                IOUtils.toString(
+                        SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json")));
        parser.parseArgument(args);

        new SparkCreateDedupRecord().run(parser);
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java
@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.dedup;
+package eu.dnetlib.dhp.oa.dedup;

 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.ObjectMapper;
@ -13,8 +13,6 @@ import org.apache.commons.io.IOUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.compress.GzipCodec;
-import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
@ -32,7 +30,9 @@ public class SparkCreateSimRels implements Serializable {
    private static final Log log = LogFactory.getLog(SparkCreateSimRels.class);

    public static void main(String[] args) throws Exception {
-        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/createSimRels_parameters.json")));
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+                IOUtils.toString(
+                        SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json")));
        parser.parseArgument(args);

        new SparkCreateSimRels().run(parser);
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java
@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.dedup;
+package eu.dnetlib.dhp.oa.dedup;

 import com.fasterxml.jackson.databind.DeserializationFeature;
 import com.fasterxml.jackson.databind.ObjectMapper;
@ -35,7 +35,9 @@ public class SparkPropagateRelation {
    final static String TARGETJSONPATH = "$.target";

    public static void main(String[] args) throws Exception {
-        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkPropagateRelation.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/propagateRelation_parameters.json")));
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+                IOUtils.toString(
+                        SparkPropagateRelation.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json")));
        parser.parseArgument(args);

        new SparkPropagateRelation().run(parser);
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkReporter.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkReporter.java
@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.dedup;
+package eu.dnetlib.dhp.oa.dedup;

 import eu.dnetlib.pace.util.Reporter;
 import org.apache.commons.logging.Log;
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkUpdateEntity.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkUpdateEntity.java
@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.dedup;
+package eu.dnetlib.dhp.oa.dedup;

 import com.fasterxml.jackson.databind.DeserializationFeature;
 import com.fasterxml.jackson.databind.ObjectMapper;
@ -28,7 +28,9 @@ public class SparkUpdateEntity implements Serializable {
    final String IDJSONPATH = "$.id";

    public static void main(String[] args) throws Exception {
-        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkUpdateEntity.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/updateEntity_parameters.json")));
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+                IOUtils.toString(
+                        SparkUpdateEntity.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json")));
        parser.parseArgument(args);

        new SparkUpdateEntity().run(parser);
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java
@ -1,7 +1,7 @@
-package eu.dnetlib.dhp.dedup.graph;
+package eu.dnetlib.dhp.oa.dedup.graph;

 import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.dedup.DedupUtility;
+import eu.dnetlib.dhp.oa.dedup.DedupUtility;
 import eu.dnetlib.pace.util.PaceException;
 import org.apache.commons.lang.StringUtils;
 import org.codehaus.jackson.annotate.JsonIgnore;
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/GraphProcessor.scala
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/GraphProcessor.scala
@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.dedup.graph
+package eu.dnetlib.dhp.oa.dedup.graph

 import org.apache.spark.graphx._
 import org.apache.spark.rdd.RDD
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/consistency/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/consistency/oozie_app/config-default.xml
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/consistency/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/consistency/oozie_app/workflow.xml
@ -55,7 +55,7 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Update Entity</name>
-            <class>eu.dnetlib.dhp.dedup.SparkUpdateEntity</class>
+            <class>eu.dnetlib.dhp.oa.dedup.SparkUpdateEntity</class>
            <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory ${sparkExecutorMemory}
@ -82,7 +82,7 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Update Relations</name>
-            <class>eu.dnetlib.dhp.dedup.SparkPropagateRelation</class>
+            <class>eu.dnetlib.dhp.oa.dedup.SparkPropagateRelation</class>
            <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory ${sparkExecutorMemory}
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/dedupRecord_parameters.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/dedupRecord_parameters.json
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/config-default.xml
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml
@ -59,7 +59,7 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Create Similarity Relations</name>
-            <class>eu.dnetlib.dhp.dedup.SparkCreateSimRels</class>
+            <class>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</class>
            <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory ${sparkExecutorMemory}
@ -86,7 +86,7 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Create Merge Relations</name>
-            <class>eu.dnetlib.dhp.dedup.SparkCreateConnectedComponent</class>
+            <class>eu.dnetlib.dhp.oa.dedup.SparkCreateConnectedComponent</class>
            <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory ${sparkExecutorMemory}
@ -114,7 +114,7 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Create Dedup Record</name>
-            <class>eu.dnetlib.dhp.dedup.SparkCreateDedupRecord</class>
+            <class>eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord</class>
            <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory ${sparkExecutorMemory}
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/dedup/MergeAuthorTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/dedup/MergeAuthorTest.java
@ -1,10 +1,10 @@
-package eu.dnetlib.dhp.dedup;
+package eu.dnetlib.dhp.oa.dedup.dedup;

+import eu.dnetlib.dhp.oa.dedup.DedupUtility;
 import eu.dnetlib.dhp.schema.oaf.Publication;
 import org.apache.commons.io.IOUtils;
 import org.codehaus.jackson.map.ObjectMapper;
 import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;

 import java.io.IOException;
 import java.util.Arrays;
@ -30,7 +30,8 @@ public class MergeAuthorTest {
        }).collect(Collectors.toList());
    }

-    @Test
+    //FIX ME Michele DB this tests doesn't work
+    //@Test
    public void test() throws  Exception {
        Publication dedup = new Publication();

--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/dedup/SparkCreateDedupTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/dedup/SparkCreateDedupTest.java
@ -1,8 +1,11 @@
-package eu.dnetlib.dhp.dedup;
+package eu.dnetlib.dhp.oa.dedup.dedup;

 import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hashing;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.oa.dedup.SparkCreateConnectedComponent;
+import eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord;
+import eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Disabled;

--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/dedup/jpath/JsonPathTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/dedup/jpath/JsonPathTest.java
@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.dedup.jpath;
+package eu.dnetlib.dhp.oa.dedup.dedup.jpath;

 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.model.MapDocument;
--- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/authors_merge.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/authors_merge.json
--- a/dhp-workflows/dhp-dedup-scholexplorer/pom.xml
+++ b/dhp-workflows/dhp-dedup-scholexplorer/pom.xml
@ -0,0 +1,57 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>dhp-workflows</artifactId>
+        <groupId>eu.dnetlib.dhp</groupId>
+        <version>1.1.6-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <artifactId>dhp-dedup-scholexplorer</artifactId>
+
+    <dependencies>
+
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-core_2.11</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-sql_2.11</artifactId>
+        </dependency>
+
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-common</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-schemas</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>eu.dnetlib</groupId>
+            <artifactId>dnet-pace-core</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-graphx_2.11</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-databind</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-core</artifactId>
+        </dependency>
+
+
+
+    </dependencies>
+
+
+</project>
--- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkPropagateRelationsJob.java
+++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkPropagateRelationsJob.java
@ -0,0 +1,103 @@
+package eu.dnetlib.dedup;
+
+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.DataInfo;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.utils.DHPUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.Optional;
+import org.apache.spark.api.java.function.Function;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.sql.*;
+import scala.Tuple2;
+
+import java.io.IOException;
+
+public class SparkPropagateRelationsJob {
+    enum FieldType {
+        SOURCE,
+        TARGET
+    }
+    final static String SOURCEJSONPATH = "$.source";
+    final static String TARGETJSONPATH = "$.target";
+
+    public static void main(String[] args) throws Exception {
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkPropagateRelationsJob.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_propagate_relation_parameters.json")));
+        parser.parseArgument(args);
+        final SparkSession spark = SparkSession
+                .builder()
+                .appName(SparkUpdateEntityJob.class.getSimpleName())
+                .master(parser.get("master"))
+                .getOrCreate();
+
+        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+        final String relationPath = parser.get("relationPath");
+        final String mergeRelPath = parser.get("mergeRelPath");
+        final String targetRelPath = parser.get("targetRelPath");
+
+
+        final Dataset<Relation> merge = spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class)).where("relClass == 'merges'");
+
+        final Dataset<Relation> rels= spark.read().load(relationPath).as(Encoders.bean(Relation.class));
+
+
+        final Dataset<Relation> firstJoin = rels.joinWith(merge, merge.col("target").equalTo(rels.col("source")), "left_outer")
+                .map((MapFunction<Tuple2<Relation, Relation>, Relation>) r -> {
+                    final Relation mergeRelation = r._2();
+                    final Relation relation = r._1();
+
+                    if(mergeRelation!= null)
+                        relation.setSource(mergeRelation.getSource());
+                    return relation;
+                }, Encoders.bean(Relation.class));
+
+        final Dataset<Relation> secondJoin = firstJoin.joinWith(merge, merge.col("target").equalTo(firstJoin.col("target")), "left_outer")
+                .map((MapFunction<Tuple2<Relation, Relation>, Relation>) r -> {
+                    final Relation mergeRelation = r._2();
+                    final Relation relation = r._1();
+                    if (mergeRelation != null )
+                        relation.setTarget(mergeRelation.getSource());
+                    return relation;
+                }, Encoders.bean(Relation.class));
+
+        secondJoin.write().mode(SaveMode.Overwrite).save(targetRelPath);
+    }
+
+    private static boolean containsDedup(final String json) {
+        final String source = DHPUtils.getJPathString(SOURCEJSONPATH, json);
+        final String target = DHPUtils.getJPathString(TARGETJSONPATH, json);
+
+        return source.toLowerCase().contains("dedup") || target.toLowerCase().contains("dedup");
+    }
+
+
+    private static String replaceField(final String json, final String id, final FieldType type) {
+        ObjectMapper mapper = new ObjectMapper();
+        mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
+        try {
+            Relation relation = mapper.readValue(json, Relation.class);
+            if (relation.getDataInfo() == null)
+                relation.setDataInfo(new DataInfo());
+            relation.getDataInfo().setDeletedbyinference(false);
+            switch (type) {
+                case SOURCE:
+                    relation.setSource(id);
+                    return mapper.writeValueAsString(relation);
+                case TARGET:
+                    relation.setTarget(id);
+                    return mapper.writeValueAsString(relation);
+                default:
+                    throw new IllegalArgumentException("");
+            }
+        } catch (IOException e) {
+            throw new RuntimeException("unable to deserialize json relation: " + json, e);
+        }
+    }
+}
--- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkUpdateEntityJob.java
+++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkUpdateEntityJob.java
@ -0,0 +1,93 @@
+package eu.dnetlib.dedup;
+
+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.DataInfo;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
+import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
+import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
+import eu.dnetlib.dhp.utils.DHPUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.sql.*;
+import scala.Tuple2;
+
+import java.io.IOException;
+
+public class SparkUpdateEntityJob {
+
+    final static String IDJSONPATH = "$.id";
+
+    public static void main(String[] args) throws Exception {
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkUpdateEntityJob.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_delete_by_inference_parameters.json")));
+        parser.parseArgument(args);
+        final SparkSession spark = SparkSession
+                .builder()
+                .appName(SparkUpdateEntityJob.class.getSimpleName())
+                .master(parser.get("master"))
+                .getOrCreate();
+
+        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+        final String entityPath = parser.get("entityPath");
+        final String mergeRelPath = parser.get("mergeRelPath");
+        final String dedupRecordPath = parser.get("dedupRecordPath");
+        final String entity = parser.get("entity");
+        final String destination = parser.get("targetPath");
+
+        final Dataset<Relation> df = spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class));
+        final JavaPairRDD<String, String> mergedIds = df
+                .where("relClass == 'merges'")
+                .select(df.col("target"))
+                .distinct()
+                .toJavaRDD()
+                .mapToPair((PairFunction<Row, String, String>) r -> new Tuple2<>(r.getString(0), "d"));
+        final JavaRDD<String> sourceEntity = sc.textFile(entityPath);
+
+        final JavaRDD<String> dedupEntity = sc.textFile(dedupRecordPath);
+            JavaPairRDD<String, String> entitiesWithId = sourceEntity.mapToPair((PairFunction<String, String, String>) s -> new Tuple2<>(DHPUtils.getJPathString(IDJSONPATH, s), s));
+            Class<? extends Oaf> mainClass;
+            switch (entity) {
+                case "publication":
+                    mainClass = DLIPublication.class;
+                    break;
+                case "dataset":
+                    mainClass = DLIDataset.class;
+                    break;
+                case "unknown":
+                    mainClass = DLIUnknown.class;
+                    break;
+                default:
+                    throw new IllegalArgumentException("Illegal type " + entity);
+
+            }
+            JavaRDD<String> map = entitiesWithId.leftOuterJoin(mergedIds).map(k -> k._2()._2().isPresent() ? updateDeletedByInference(k._2()._1(), mainClass) : k._2()._1());
+            map.union(dedupEntity).saveAsTextFile(destination, GzipCodec.class);
+
+    }
+
+    private static <T extends Oaf> String updateDeletedByInference(final String json, final Class<T> clazz) {
+        final ObjectMapper mapper = new ObjectMapper();
+        mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
+        try {
+            Oaf entity = mapper.readValue(json, clazz);
+            if (entity.getDataInfo()== null)
+                entity.setDataInfo(new DataInfo());
+            entity.getDataInfo().setDeletedbyinference(true);
+            return mapper.writeValueAsString(entity);
+        } catch (IOException e) {
+            throw new RuntimeException("Unable to convert json", e);
+        }
+
+
+    }
+
+
+}
--- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/dedup/dedup_delete_by_inference_parameters.json
+++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/dedup/dedup_delete_by_inference_parameters.json
@ -0,0 +1,38 @@
+[
+  {
+    "paramName": "mt",
+    "paramLongName": "master",
+    "paramDescription": "should be local or yarn",
+    "paramRequired": true
+  },
+  {
+    "paramName": "ep",
+    "paramLongName": "entityPath",
+    "paramDescription": "the input entity path",
+    "paramRequired": true
+  },
+  {
+    "paramName": "mr",
+    "paramLongName": "mergeRelPath",
+    "paramDescription": "the input path of merge Rel",
+    "paramRequired": true
+  },
+  {
+    "paramName": "dr",
+    "paramLongName": "dedupRecordPath",
+    "paramDescription": "the inputPath of dedup record",
+    "paramRequired": true
+  },
+  {
+    "paramName": "e",
+    "paramLongName": "entity",
+    "paramDescription": "the type of entity",
+    "paramRequired": true
+  },
+  {
+    "paramName": "t",
+    "paramLongName": "targetPath",
+    "paramDescription": "the targetPath",
+    "paramRequired": true
+  }
+]
--- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/dedup/dedup_propagate_relation_parameters.json
+++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/dedup/dedup_propagate_relation_parameters.json
@ -0,0 +1,26 @@
+[
+  {
+    "paramName": "mt",
+    "paramLongName": "master",
+    "paramDescription": "should be local or yarn",
+    "paramRequired": true
+  },
+  {
+    "paramName": "ep",
+    "paramLongName": "relationPath",
+    "paramDescription": "the input relation path",
+    "paramRequired": true
+  },
+  {
+    "paramName": "mr",
+    "paramLongName": "mergeRelPath",
+    "paramDescription": "the input path of merge Rel",
+    "paramRequired": true
+  },
+  {
+    "paramName": "t",
+    "paramLongName": "targetRelPath",
+    "paramDescription": "the output Rel Path",
+    "paramRequired": true
+  }
+]
--- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml
@ -0,0 +1,206 @@
+<workflow-app name="Dedup Entities" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>sourcePath</name>
+            <description>the source path</description>
+        </property>
+        <property>
+            <name>entity</name>
+            <description>the entity that should be processed</description>
+        </property>
+        <property>
+            <name>dedupConf</name>
+            <description>the dedup Configuration</description>
+        </property>
+        <property>
+            <name>targetPath</name>
+            <description>the target path</description>
+        </property>
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+    </parameters>
+
+    <start to="DeleteWorkingPath"/>
+
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <action name="DeleteWorkingPath">
+        <fs>
+            <delete path='${targetPath}/${entity}'/>
+            <mkdir path="${targetPath}"/>  
+            <mkdir path="${targetPath}/${entity}"/>              
+        </fs>
+        <ok to="CreateSimRels"/>
+       <error to="Kill"/>
+    </action>
+
+    <action name="CreateSimRels">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Create Similarity Relations</name>
+            <class>eu.dnetlib.dedup.SparkCreateSimRels</class>
+            <jar>dhp-dedup-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory ${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                ${sparkExtraOPT}
+            </spark-opts>
+            <arg>-mt</arg><arg>yarn-cluster</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+            <arg>--targetPath</arg><arg>${targetPath}</arg>
+            <arg>--entity</arg><arg>${entity}</arg>
+            <arg>--dedupConf</arg><arg>${dedupConf}</arg>
+        </spark>
+        <ok to="CreateConnectedComponents"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="CreateConnectedComponents">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Create Connected Components</name>
+            <class>eu.dnetlib.dedup.SparkCreateConnectedComponent</class>
+            <jar>dhp-dedup-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory ${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                ${sparkExtraOPT}
+            </spark-opts>
+            <arg>-mt</arg><arg>yarn-cluster</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+            <arg>--targetPath</arg><arg>${targetPath}</arg>
+            <arg>--entity</arg><arg>${entity}</arg>
+            <arg>--dedupConf</arg><arg>${dedupConf}</arg>
+        </spark>
+        <ok to="CreateDedupRecord"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="CreateDedupRecord">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Create Dedup Record</name>
+            <class>eu.dnetlib.dedup.SparkCreateDedupRecord</class>
+            <jar>dhp-dedup-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory ${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                ${sparkExtraOPT}
+            </spark-opts>
+            <arg>-mt</arg><arg>yarn-cluster</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+            <arg>--dedupPath</arg><arg>${targetPath}</arg>
+            <arg>--entity</arg><arg>${entity}</arg>
+            <arg>--dedupConf</arg><arg>${dedupConf}</arg>
+        </spark>
+        <ok to="fixRelation"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="fixRelation">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Propagate Dedup Relations</name>
+            <class>eu.dnetlib.dedup.SparkPropagateRelationsJob</class>
+            <jar>dhp-dedup-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory ${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                ${sparkExtraOPT}
+            </spark-opts>
+            <arg>-mt</arg><arg>yarn-cluster</arg>
+           <arg>--mergeRelPath</arg><arg>${targetPath}/${entity}/mergeRel</arg>
+            <arg>--relationPath</arg><arg>${sourcePath}/relation</arg>
+            <arg>--targetRelPath</arg><arg>${targetPath}/${entity}/updated_relation</arg>
+        </spark>
+        <ok to="updateDeletedByInferenceEntity"/>
+        <error to="Kill"/>
+    </action>
+
+
+    <action name="updateDeletedByInferenceEntity">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Update ${entity} and add DedupRecord</name>
+            <class>eu.dnetlib.dedup.SparkUpdateEntityJob</class>
+            <jar>dhp-dedup-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory ${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                ${sparkExtraOPT}
+            </spark-opts>
+            <arg>-mt</arg><arg>yarn-cluster</arg>
+            <arg>--entityPath</arg><arg>${sourcePath}/${entity}</arg>
+            <arg>--mergeRelPath</arg><arg>${targetPath}/${entity}/mergeRel</arg>
+            <arg>--entity</arg><arg>${entity}</arg>
+            <arg>--dedupRecordPath</arg><arg>${targetPath}/${entity}/dedup_records</arg>
+            <arg>--targetPath</arg><arg>${targetPath}/${entity}/updated_record</arg>
+        </spark>
+        <ok to="replaceEntity"/>
+        <error to="Kill"/>
+    </action>
+
+<!--    <action name="updateDeletedByInferenceRelation">-->
+<!--        <spark xmlns="uri:oozie:spark-action:0.2">-->
+<!--            <job-tracker>${jobTracker}</job-tracker>-->
+<!--            <name-node>${nameNode}</name-node>-->
+<!--            <master>yarn-cluster</master>-->
+<!--            <mode>cluster</mode>-->
+<!--            <name>Update ${entity} set deleted by Inference</name>-->
+<!--            <class>eu.dnetlib.dedup.SparkUpdateEntityJob</class>-->
+<!--            <jar>dhp-dedup-${projectVersion}.jar</jar>-->
+<!--            <spark-opts>-->
+<!--                &#45;&#45;executor-memory ${sparkExecutorMemory}-->
+<!--                &#45;&#45;driver-memory=${sparkDriverMemory}-->
+<!--                ${sparkExtraOPT}-->
+<!--            </spark-opts>-->
+<!--            <arg>-mt</arg><arg>yarn-cluster</arg>-->
+<!--            <arg>&#45;&#45;entityPath</arg><arg>${targetPath}/${entity}/relation_propagated</arg>-->
+<!--            <arg>&#45;&#45;mergeRelPath</arg><arg>${targetPath}/${entity}/mergeRel</arg>-->
+<!--            <arg>&#45;&#45;entity</arg><arg>relation</arg>-->
+<!--            <arg>&#45;&#45;dedupRecordPath</arg><arg>${targetPath}/${entity}/dedup_records</arg>-->
+<!--            <arg>&#45;&#45;targetPath</arg><arg>${targetPath}/${entity}/updated_relation</arg>-->
+<!--        </spark>-->
+<!--        <ok to="End"/>-->
+<!--        <error to="Kill"/>-->
+<!--    </action>-->
+
+
+    <action name="replaceEntity">
+        <fs>
+            <delete path='${sourcePath}/${entity}'/>
+            <delete path='${sourcePath}/relation'/>
+            <move source="${targetPath}/${entity}/updated_relation" target="${sourcePath}/relation" />
+            <move source="${targetPath}/${entity}/updated_record" target="${sourcePath}/${entity}" />
+        </fs>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+
+    <end name="End"/>
+</workflow-app>
--- a/dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/conf/pub_scholix.conf.json
+++ b/dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/conf/pub_scholix.conf.json
@ -0,0 +1,378 @@
+{
+  "wf": {
+    "threshold": "0.99",
+    "dedupRun": "001",
+    "entityType": "result",
+    "subEntityType": "resulttype",
+    "subEntityValue": "publication",
+    "orderField": "title",
+    "queueMaxSize": "2000",
+    "groupMaxSize": "100",
+    "maxChildren": "100",
+    "slidingWindowSize": "200",
+    "rootBuilder": [
+    ],
+    "includeChildren": "true",
+    "maxIterations": 20,
+    "idPath": "$.id"
+  },
+  "pace": {
+    "clustering": [
+      {
+        "name": "ngrampairs",
+        "fields": [
+          "title"
+        ],
+        "params": {
+          "max": "1",
+          "ngramLen": "3"
+        }
+      },
+      {
+        "name": "suffixprefix",
+        "fields": [
+          "title"
+        ],
+        "params": {
+          "max": "1",
+          "len": "3"
+        }
+      }
+    ],
+    "decisionTree": {
+      "start": {
+        "fields": [
+          {
+            "field": "pid",
+            "comparator": "jsonListMatch",
+            "weight": 1.0,
+            "countIfUndefined": "false",
+            "params": {
+              "jpath_value": "$.value",
+              "jpath_classid": "$.qualifier.classid"
+            }
+          }
+        ],
+        "threshold": 0.5,
+        "aggregation": "AVG",
+        "positive": "MATCH",
+        "negative": "layer2",
+        "undefined": "layer2",
+        "ignoreUndefined": "true"
+      },
+      "layer2": {
+        "fields": [
+          {
+            "field": "title",
+            "comparator": "titleVersionMatch",
+            "weight": 1.0,
+            "countIfUndefined": "false",
+            "params": {}
+          },
+          {
+            "field": "authors",
+            "comparator": "sizeMatch",
+            "weight": 1.0,
+            "countIfUndefined": "false",
+            "params": {}
+          }
+        ],
+        "threshold": 1.0,
+        "aggregation": "AND",
+        "positive": "layer3",
+        "negative": "NO_MATCH",
+        "undefined": "layer3",
+        "ignoreUndefined": "false"
+      },
+      "layer3": {
+        "fields": [
+          {
+            "field": "title",
+            "comparator": "levensteinTitle",
+            "weight": 1.0,
+            "countIfUndefined": "true",
+            "params": {}
+          }
+        ],
+        "threshold": 0.99,
+        "aggregation": "AVG",
+        "positive": "MATCH",
+        "negative": "NO_MATCH",
+        "undefined": "NO_MATCH",
+        "ignoreUndefined": "true"
+      }
+    },
+    "model": [
+      {
+        "name": "pid",
+        "type": "JSON",
+        "path": "$.pid",
+        "overrideMatch": "true"
+      },
+      {
+        "name": "title",
+        "type": "String",
+        "path": "$.title[*].value",
+        "length": 250,
+        "size": 5
+      },
+      {
+        "name": "authors",
+        "type": "List",
+        "path": "$.author[*].fullname",
+        "size": 200
+      },
+      {
+        "name": "resulttype",
+        "type": "String",
+        "path": "$.resulttype.classid"
+      }
+    ],
+    "blacklists": {
+      "title": [
+        "^Inside Front Cover$",
+        "^CORR Insights$",
+        "^Index des notions$",
+        "^Department of Error.$",
+        "^Untitled Item$",
+        "^Department of Error$",
+        "^Tome II : 1598 à 1605$",
+        "^(à l’exception de roi, prince, royauté, pouvoir, image… qui sont omniprésents)$",
+        "^Museen und Ausstellungsinstitute in Nürnberg$",
+        "^Text/Conference Paper$",
+        "^Table des illustrations$",
+        "^An Intimate Insight on Psychopathy and a Novel Hermeneutic Psychological Science$",
+        "^Index des noms$",
+        "^Reply by Authors.$",
+        "^Titelblatt - Inhalt$",
+        "^Index des œuvres,$",
+        "(?i)^Poster presentations$",
+        "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
+        "^Problems with perinatal pathology\\.?$",
+        "(?i)^Cases? of Puerperal Convulsions$",
+        "(?i)^Operative Gyna?ecology$",
+        "(?i)^Mind the gap\\!?\\:?$",
+        "^Chronic fatigue syndrome\\.?$",
+        "^Cartas? ao editor Letters? to the Editor$",
+        "^Note from the Editor$",
+        "^Anesthesia Abstract$",
+        "^Annual report$",
+        "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
+        "(?i)^Graph and Table of Infectious Diseases?$",
+        "^Presentation$",
+        "(?i)^Reviews and Information on Publications$",
+        "(?i)^PUBLIC HEALTH SERVICES?$",
+        "(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
+        "(?i)^Adrese autora$",
+        "(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
+        "(?i)^Acknowledgement to Referees$",
+        "(?i)^Behçet's disease\\.?$",
+        "(?i)^Isolation and identification of restriction endonuclease.*$",
+        "(?i)^CEREBROVASCULAR DISEASES?.?$",
+        "(?i)^Screening for abdominal aortic aneurysms?\\.?$",
+        "^Event management$",
+        "(?i)^Breakfast and Crohn's disease.*\\.?$",
+        "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
+        "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
+        "^Gushi hakubutsugaku$",
+        "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
+        "^Intestinal spirocha?etosis$",
+        "^Treatment of Rodent Ulcer$",
+        "(?i)^\\W*Cloud Computing\\W*$",
+        "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
+        "^Free Communications, Poster Presentations: Session [A-F]$",
+        "^“The Historical Aspects? of Quackery\\.?”$",
+        "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
+        "^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
+        "(?i)^Case Report$",
+        "^Boletín Informativo$",
+        "(?i)^Glioblastoma Multiforme$",
+        "(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
+        "^Zaměstnanecké výhody$",
+        "(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
+        "(?i)^Carotid body tumours?\\.?$",
+        "(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
+        "^Avant-propos$",
+        "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
+        "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
+        "(?i)^PUBLIC HEALTH VERSUS THE STATE$",
+        "^Viñetas de Cortázar$",
+        "(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
+        "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
+        "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
+        "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
+        "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
+        "^Aus der AGMB$",
+        "^Znanstveno-stručni prilozi$",
+        "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
+        "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
+        "(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
+        "^Finanční analýza podniku$",
+        "^Financial analysis( of business)?$",
+        "(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
+        "^Jikken nihon shūshinsho$",
+        "(?i)^CORONER('|s)(s|') INQUESTS$",
+        "(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
+        "(?i)^Consultants' contract(s)?$",
+        "(?i)^Upute autorima$",
+        "(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
+        "^Joshi shin kokubun$",
+        "^Kōtō shōgaku dokuhon nōson'yō$",
+        "^Jinjō shōgaku shōka$",
+        "^Shōgaku shūjichō$",
+        "^Nihon joshi dokuhon$",
+        "^Joshi shin dokuhon$",
+        "^Chūtō kanbun dokuhon$",
+        "^Wabun dokuhon$",
+        "(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
+        "(?i)^cardiac rehabilitation$",
+        "(?i)^Analytical summary$",
+        "^Thesaurus resolutionum Sacrae Congregationis Concilii$",
+        "(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
+        "^Prikazi i osvrti$",
+        "^Rodinný dům s provozovnou$",
+        "^Family house with an establishment$",
+        "^Shinsei chūtō shin kokugun$",
+        "^Pulmonary alveolar proteinosis(\\.?)$",
+        "^Shinshū kanbun$",
+        "^Viñeta(s?) de Rodríguez$",
+        "(?i)^RUBRIKA UREDNIKA$",
+        "^A Matching Model of the Academic Publication Market$",
+        "^Yōgaku kōyō$",
+        "^Internetový marketing$",
+        "^Internet marketing$",
+        "^Chūtō kokugo dokuhon$",
+        "^Kokugo dokuhon$",
+        "^Antibiotic Cover for Dental Extraction(s?)$",
+        "^Strategie podniku$",
+        "^Strategy of an Enterprise$",
+        "(?i)^respiratory disease(s?)(\\.?)$",
+        "^Award(s?) for Gallantry in Civil Defence$",
+        "^Podniková kultura$",
+        "^Corporate Culture$",
+        "^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
+        "^Pracovní motivace$",
+        "^Work Motivation$",
+        "^Kaitei kōtō jogaku dokuhon$",
+        "^Konsolidovaná účetní závěrka$",
+        "^Consolidated Financial Statements$",
+        "(?i)^intracranial tumour(s?)$",
+        "^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
+        "^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
+        "^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
+        "^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
+        "^Úroveň motivačního procesu jako způsobu vedení lidí$",
+        "^The level of motivation process as a leadership$",
+        "^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
+        "(?i)^news and events$",
+        "(?i)^NOVOSTI I DOGAĐAJI$",
+        "^Sansū no gakushū$",
+        "^Posouzení informačního systému firmy a návrh změn$",
+        "^Information System Assessment and Proposal for ICT Modification$",
+        "^Stresové zatížení pracovníků ve vybrané profesi$",
+        "^Stress load in a specific job$",
+        "^Sunday: Poster Sessions, Pt.*$",
+        "^Monday: Poster Sessions, Pt.*$",
+        "^Wednesday: Poster Sessions, Pt.*",
+        "^Tuesday: Poster Sessions, Pt.*$",
+        "^Analýza reklamy$",
+        "^Analysis of advertising$",
+        "^Shōgaku shūshinsho$",
+        "^Shōgaku sansū$",
+        "^Shintei joshi kokubun$",
+        "^Taishō joshi kokubun dokuhon$",
+        "^Joshi kokubun$",
+        "^Účetní uzávěrka a účetní závěrka v ČR$",
+        "(?i)^The \"?Causes\"? of Cancer$",
+        "^Normas para la publicación de artículos$",
+        "^Editor('|s)(s|') [Rr]eply$",
+        "^Editor(’|s)(s|’) letter$",
+        "^Redaktoriaus žodis$",
+        "^DISCUSSION ON THE PRECEDING PAPER$",
+        "^Kōtō shōgaku shūshinsho jidōyō$",
+        "^Shōgaku nihon rekishi$",
+        "^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
+        "^Préface$",
+        "^Occupational [Hh]ealth [Ss]ervices.$",
+        "^In Memoriam Professor Toshiyuki TAKESHIMA$",
+        "^Účetní závěrka ve vybraném podniku.*$",
+        "^Financial statements in selected company$",
+        "^Abdominal [Aa]ortic [Aa]neurysms.*$",
+        "^Pseudomyxoma peritonei$",
+        "^Kazalo autora$",
+        "(?i)^uvodna riječ$",
+        "^Motivace jako způsob vedení lidí$",
+        "^Motivation as a leadership$",
+        "^Polyfunkční dům$",
+        "^Multi\\-funkcional building$",
+        "^Podnikatelský plán$",
+        "(?i)^Podnikatelský záměr$",
+        "(?i)^Business Plan$",
+        "^Oceňování nemovitostí$",
+        "^Marketingová komunikace$",
+        "^Marketing communication$",
+        "^Sumario Analítico$",
+        "^Riječ uredništva$",
+        "^Savjetovanja i priredbe$",
+        "^Índice$",
+        "^(Starobosanski nadpisi).*$",
+        "^Vzdělávání pracovníků v organizaci$",
+        "^Staff training in organization$",
+        "^(Life Histories of North American Geometridae).*$",
+        "^Strategická analýza podniku$",
+        "^Strategic Analysis of an Enterprise$",
+        "^Sadržaj$",
+        "^Upute suradnicima$",
+        "^Rodinný dům$",
+        "(?i)^Fami(l)?ly house$",
+        "^Upute autorima$",
+        "^Strategic Analysis$",
+        "^Finanční analýza vybraného podniku$",
+        "^Finanční analýza$",
+        "^Riječ urednika$",
+        "(?i)^Content(s?)$",
+        "(?i)^Inhalt$",
+        "^Jinjō shōgaku shūshinsho jidōyō$",
+        "(?i)^Index$",
+        "^Chūgaku kokubun kyōkasho$",
+        "^Retrato de una mujer$",
+        "^Retrato de un hombre$",
+        "^Kōtō shōgaku dokuhon$",
+        "^Shotōka kokugo$",
+        "^Shōgaku dokuhon$",
+        "^Jinjō shōgaku kokugo dokuhon$",
+        "^Shinsei kokugo dokuhon$",
+        "^Teikoku dokuhon$",
+        "^Instructions to Authors$",
+        "^KİTAP TAHLİLİ$",
+        "^PRZEGLĄD PIŚMIENNICTWA$",
+        "(?i)^Presentación$",
+        "^İçindekiler$",
+        "(?i)^Tabl?e of contents$",
+        "^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
+        "^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
+        "^Editorial( Board)?$",
+        "(?i)^Editorial \\(English\\)$",
+        "^Editörden$",
+        "^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
+        "^(Kiri Karl Morgensternile).*$",
+        "^(\\[Eksliibris Aleksandr).*\\]$",
+        "^(\\[Eksliibris Aleksandr).*$",
+        "^(Eksliibris Aleksandr).*$",
+        "^(Kiri A\\. de Vignolles).*$",
+        "^(2 kirja Karl Morgensternile).*$",
+        "^(Pirita kloostri idaosa arheoloogilised).*$",
+        "^(Kiri tundmatule).*$",
+        "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
+        "^(Eksliibris Nikolai Birukovile).*$",
+        "^(Eksliibris Nikolai Issakovile).*$",
+        "^(WHP Cruise Summary Information of section).*$",
+        "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
+        "^(Measurement of the spin\\-dependent structure function).*",
+        "(?i)^.*authors['’′]? reply\\.?$",
+        "(?i)^.*authors['’′]? response\\.?$"
+      ]
+    },
+    "synonyms": {}
+  }
+}
--- a/dhp-workflows/dhp-graph-mapper/derby.log
+++ b/dhp-workflows/dhp-graph-mapper/derby.log
@ -0,0 +1,13 @@
+----------------------------------------------------------------
+Thu Mar 26 19:43:00 CET 2020:
+Booting Derby version The Apache Software Foundation - Apache Derby - 10.12.1.1 - (1704137): instance a816c00e-0171-1827-9724-000012c70f40 
+on database directory /private/var/folders/xn/nr5vdk8n1572rvrnx5890_d80000gn/T/junit3871072562876431144/junit_metastore_db with class loader org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1@4e6b5ed4 
+Loaded from file:/Users/claudio/.m2/repository/org/apache/derby/derby/10.12.1.1/derby-10.12.1.1.jar
+java.vendor=Oracle Corporation
+java.runtime.version=1.8.0_181-b13
+user.dir=/Users/claudio/workspace/git/dnet-hadoop/dhp-workflows/dhp-graph-mapper
+os.name=Mac OS X
+os.arch=x86_64
+os.version=10.15.3
+derby.system.home=null
+Database Class Loader started - derby.database.classpath=''
--- a/dhp-workflows/dhp-graph-mapper/pom.xml
+++ b/dhp-workflows/dhp-graph-mapper/pom.xml
@ -1,5 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>dhp-workflows</artifactId>
        <groupId>eu.dnetlib.dhp</groupId>
@ -11,6 +12,11 @@

    <dependencies>

+        <dependency>
+            <groupId>commons-io</groupId>
+            <artifactId>commons-io</artifactId>
+        </dependency>
+
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
@ -19,6 +25,11 @@
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
        </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-hive_2.11</artifactId>
+            <scope>test</scope>
+        </dependency>

        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
@ -30,6 +41,14 @@
            <artifactId>dhp-schemas</artifactId>
            <version>${project.version}</version>
        </dependency>
+        <dependency>
+            <groupId>com.jayway.jsonpath</groupId>
+            <artifactId>json-path</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.mongodb</groupId>
+            <artifactId>mongo-java-driver</artifactId>
+        </dependency>

    </dependencies>

--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/ImportDataFromMongo.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/ImportDataFromMongo.java
@ -0,0 +1,108 @@
+package eu.dnetlib.dhp.graph.scholexplorer;
+
+import com.mongodb.DBObject;
+import com.mongodb.MongoClient;
+import com.mongodb.QueryBuilder;
+import com.mongodb.client.FindIterable;
+import com.mongodb.client.MongoCollection;
+import com.mongodb.client.MongoDatabase;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.bson.Document;
+import org.bson.conversions.Bson;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.Consumer;
+import java.util.stream.Collectors;
+
+public class ImportDataFromMongo {
+
+
+    public static void main(String[] args) throws Exception {
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+                IOUtils.toString(
+                        ImportDataFromMongo.class.getResourceAsStream(
+                                "/eu/dnetlib/dhp/graph/import_from_mongo_parameters.json")));
+        parser.parseArgument(args);
+        final int port = Integer.parseInt(parser.get("dbport"));
+        final String host = parser.get("dbhost");
+
+        final String format = parser.get("format");
+        final String layout = parser.get("layout");
+        final String interpretation = parser.get("interpretation");
+
+        final String dbName = parser.get("dbName");
+
+
+        final MongoClient client = new MongoClient(host, port);
+
+        MongoDatabase database = client.getDatabase(dbName);
+
+        MongoCollection<Document> metadata = database.getCollection("metadata");
+        MongoCollection<Document> metadataManager = database.getCollection("metadataManager");
+        final DBObject query = QueryBuilder.start("format").is(format).and("layout").is(layout).and("interpretation").is(interpretation).get();
+        final List<String> ids = new ArrayList<>();
+        metadata.find((Bson) query).forEach((Consumer<Document>) document -> ids.add(document.getString("mdId")));
+        List<String> databaseId = ids.stream().map(it -> getCurrentId(it, metadataManager)).filter(Objects::nonNull).collect(Collectors.toList());
+        final String hdfsuri = parser.get("namenode");
+        // ====== Init HDFS File System Object
+        Configuration conf = new Configuration();
+        // Set FileSystem URI
+        conf.set("fs.defaultFS", hdfsuri);
+        // Because of Maven
+        conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
+        conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
+
+        System.setProperty("HADOOP_USER_NAME", parser.get("user"));
+        System.setProperty("hadoop.home.dir", "/");
+        FileSystem.get(URI.create(hdfsuri), conf);
+        Path hdfswritepath = new Path(parser.get("targetPath"));
+
+        final AtomicInteger counter = new AtomicInteger(0);
+        try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
+                SequenceFile.Writer.file(hdfswritepath), SequenceFile.Writer.keyClass(IntWritable.class),
+                SequenceFile.Writer.valueClass(Text.class))) {
+            final IntWritable key = new IntWritable(counter.get());
+            final Text value = new Text();
+            databaseId.forEach(id -> {
+                System.out.println("Reading :"+id);
+                MongoCollection<Document> collection = database.getCollection(id);
+                collection.find().forEach((Consumer<Document>) document ->
+                        {
+                            key.set(counter.getAndIncrement());
+                            value.set(document.getString("body"));
+
+                            if (counter.get() % 10000 == 0) {
+                                System.out.println("Added "+counter.get());
+                            }
+                            try {
+                                writer.append(key, value);
+                            } catch (IOException e) {
+                                throw new RuntimeException(e);
+                            }
+                        }
+
+                );
+            });
+        }
+    }
+
+
+    private static String getCurrentId(final String mdId, final MongoCollection<Document> metadataManager) {
+        FindIterable<Document> result = metadataManager.find((Bson) QueryBuilder.start("mdId").is(mdId).get());
+        final Document item = result.first();
+        return item == null ? null : item.getString("currentId");
+    }
+
+}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/SparkExtractEntitiesJob.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/SparkExtractEntitiesJob.java
@ -0,0 +1,104 @@
+package eu.dnetlib.dhp.graph.scholexplorer;
+
+import com.jayway.jsonpath.JsonPath;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.oa.graph.SparkGraphImporterJob;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SparkSession;
+import net.minidev.json.JSONArray;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+
+public class SparkExtractEntitiesJob {
+    final static String IDJSONPATH = "$.id";
+    final static String SOURCEJSONPATH = "$.source";
+    final static String TARGETJSONPATH = "$.target";
+
+
+    public static void main(String[] args) throws Exception {
+
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+                IOUtils.toString(
+                        SparkExtractEntitiesJob.class.getResourceAsStream(
+                                "/eu/dnetlib/dhp/graph/input_extract_entities_parameters.json")));
+        parser.parseArgument(args);
+        final SparkSession spark = SparkSession
+                .builder()
+                .appName(SparkExtractEntitiesJob.class.getSimpleName())
+                .master(parser.get("master"))
+                .getOrCreate();
+        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+        final String inputPath = parser.get("sourcePath");
+        final String targetPath = parser.get("targetPath");
+        final String tdir =parser.get("targetDir");
+        final JavaRDD<String> inputRDD = sc.textFile(inputPath);
+
+        List<String> entities = Arrays.stream(parser.get("entities").split(",")).map(String::trim).collect(Collectors.toList());
+        if (entities.stream().anyMatch("dataset"::equalsIgnoreCase)) {
+            //Extract Dataset
+            inputRDD.filter(SparkExtractEntitiesJob::isDataset).saveAsTextFile(targetPath + "/dataset/"+tdir, GzipCodec.class);
+        }
+        if (entities.stream().anyMatch("unknown"::equalsIgnoreCase)) {
+            //Extract Unknown
+            inputRDD.filter(SparkExtractEntitiesJob::isUnknown).saveAsTextFile(targetPath + "/unknown/"+tdir, GzipCodec.class);
+        }
+
+        if (entities.stream().anyMatch("relation"::equalsIgnoreCase)) {
+            //Extract Relation
+            inputRDD.filter(SparkExtractEntitiesJob::isRelation).saveAsTextFile(targetPath + "/relation/"+tdir, GzipCodec.class);
+        }
+        if (entities.stream().anyMatch("publication"::equalsIgnoreCase)) {
+            //Extract Relation
+            inputRDD.filter(SparkExtractEntitiesJob::isPublication).saveAsTextFile(targetPath + "/publication/"+tdir, GzipCodec.class);
+        }
+    }
+
+
+    public static boolean isDataset(final String json) {
+        final String id = getJPathString(IDJSONPATH, json);
+        if (StringUtils.isBlank(id)) return false;
+        return id.startsWith("60|");
+    }
+
+
+    public static boolean isPublication(final String json) {
+        final String id = getJPathString(IDJSONPATH, json);
+        if (StringUtils.isBlank(id)) return false;
+        return id.startsWith("50|");
+    }
+
+    public static boolean isUnknown(final String json) {
+        final String id = getJPathString(IDJSONPATH, json);
+        if (StringUtils.isBlank(id)) return false;
+        return id.startsWith("70|");
+    }
+
+    public static boolean isRelation(final String json) {
+        final String source = getJPathString(SOURCEJSONPATH, json);
+        final String target = getJPathString(TARGETJSONPATH, json);
+        return StringUtils.isNotBlank(source) && StringUtils.isNotBlank(target);
+    }
+
+
+    public static String getJPathString(final String jsonPath, final String json) {
+        try {
+            Object o = JsonPath.read(json, jsonPath);
+            if (o instanceof String)
+                return (String) o;
+            if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
+                return (String) ((JSONArray) o).get(0);
+            return "";
+        } catch (Exception e) {
+            return "";
+        }
+    }
+
+
+}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerGenerateSimRel.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerGenerateSimRel.java
@ -0,0 +1,52 @@
+package eu.dnetlib.dhp.graph.scholexplorer;
+
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.utils.DHPUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import scala.Tuple2;
+
+public class SparkScholexplorerGenerateSimRel {
+
+    final static String IDJSONPATH = "$.id";
+    final static String OBJIDPATH = "$.originalObjIdentifier";
+
+
+
+    public static void generateDataFrame(final SparkSession spark, final JavaSparkContext sc, final String inputPath, final String targetPath) {
+
+
+        final JavaPairRDD<String, String> datasetSimRel = sc.textFile(inputPath+"/dataset/*")
+                .mapToPair((PairFunction<String, String, String>) k ->
+                        new Tuple2<>(DHPUtils.getJPathString(IDJSONPATH, k),DHPUtils.getJPathString(OBJIDPATH, k)))
+                .filter(t ->
+                        !StringUtils.substringAfter(t._1(), "|")
+                                .equalsIgnoreCase(StringUtils.substringAfter(t._2(), "::")))
+                .distinct();
+
+        final JavaPairRDD<String, String> publicationSimRel = sc.textFile(inputPath+"/publication/*")
+                .mapToPair((PairFunction<String, String, String>) k ->
+                        new Tuple2<>(DHPUtils.getJPathString(IDJSONPATH, k),DHPUtils.getJPathString(OBJIDPATH, k)))
+                .filter(t ->
+                        !StringUtils.substringAfter(t._1(), "|")
+                                .equalsIgnoreCase(StringUtils.substringAfter(t._2(), "::")))
+                .distinct();
+
+        JavaRDD<Relation> simRel = datasetSimRel.union(publicationSimRel).map(s -> {
+                    final Relation r = new Relation();
+                    r.setSource(s._1());
+                    r.setTarget(s._2());
+                    r.setRelType("similar");
+                    return r;
+                }
+        );
+        spark.createDataset(simRel.rdd(), Encoders.bean(Relation.class)).distinct().write()
+                .mode(SaveMode.Overwrite).save(targetPath+"/pid_simRel");
+    }
+}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerGraphImporter.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerGraphImporter.java
@ -0,0 +1,55 @@
+package eu.dnetlib.dhp.graph.scholexplorer;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.graph.scholexplorer.parser.DatasetScholexplorerParser;
+import eu.dnetlib.dhp.graph.scholexplorer.parser.PublicationScholexplorerParser;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.scholexplorer.relation.RelationMapper;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.FlatMapFunction;
+import org.apache.spark.sql.SparkSession;
+import scala.Tuple2;
+
+public class SparkScholexplorerGraphImporter {
+
+    public static void main(String[] args) throws Exception {
+
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+                IOUtils.toString(
+                        SparkScholexplorerGraphImporter.class.getResourceAsStream(
+                                "/eu/dnetlib/dhp/graph/input_graph_scholix_parameters.json")));
+
+        parser.parseArgument(args);
+        final SparkSession spark = SparkSession
+                .builder()
+                .appName(SparkScholexplorerGraphImporter.class.getSimpleName())
+                .master(parser.get("master"))
+                .getOrCreate();
+        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+        final String inputPath = parser.get("sourcePath");
+
+        RelationMapper relationMapper = RelationMapper.load();
+
+        sc.sequenceFile(inputPath, IntWritable.class, Text.class).map(Tuple2::_2).map(Text::toString).repartition(500)
+                .flatMap((FlatMapFunction<String, Oaf>) record -> {
+                    switch (parser.get("entity")) {
+                        case "dataset":
+                            final DatasetScholexplorerParser d = new DatasetScholexplorerParser();
+                            return d.parseObject(record,relationMapper).iterator();
+                        case "publication":
+                            final PublicationScholexplorerParser p = new PublicationScholexplorerParser();
+                            return p.parseObject(record,relationMapper).iterator();
+                        default:
+                            throw new IllegalArgumentException("wrong values of entities");
+                    }
+                }).map(k -> {
+            ObjectMapper mapper = new ObjectMapper();
+            return mapper.writeValueAsString(k);
+        }).saveAsTextFile(parser.get("targetPath"), GzipCodec.class);
+    }
+}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerMergeEntitiesJob.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerMergeEntitiesJob.java
@ -0,0 +1,186 @@
+package eu.dnetlib.dhp.graph.scholexplorer;
+
+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.jayway.jsonpath.JsonPath;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
+import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
+import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
+import eu.dnetlib.dhp.utils.DHPUtils;
+import net.minidev.json.JSONArray;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.rdd.RDD;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import scala.Tuple2;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+public class SparkScholexplorerMergeEntitiesJob {
+
+    final static String IDJSONPATH = "$.id";
+    final static String SOURCEJSONPATH = "$.source";
+    final static String TARGETJSONPATH = "$.target";
+    final static String RELJSONPATH = "$.relType";
+
+    public static void main(String[] args) throws Exception {
+
+
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+                IOUtils.toString(
+                        SparkScholexplorerMergeEntitiesJob.class.getResourceAsStream(
+                                "/eu/dnetlib/dhp/graph/merge_entities_scholix_parameters.json")));
+        parser.parseArgument(args);
+        final SparkSession spark = SparkSession
+                .builder()
+                .config(new SparkConf()
+                        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"))
+                .appName(SparkScholexplorerMergeEntitiesJob.class.getSimpleName())
+                .master(parser.get("master"))
+                .getOrCreate();
+        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+        final String inputPath = parser.get("sourcePath");
+        final String targetPath = parser.get("targetPath");
+        final String entity = parser.get("entity");
+
+
+        FileSystem fs = FileSystem.get(sc.sc().hadoopConfiguration());
+        List<Path> subFolder = Arrays.stream(fs.listStatus(new Path(inputPath))).filter(FileStatus::isDirectory).map(FileStatus::getPath).collect(Collectors.toList());
+        List<JavaRDD<String>> inputRdd = new ArrayList<>();
+        subFolder.forEach(p -> inputRdd.add(sc.textFile(p.toUri().getRawPath())));
+        JavaRDD<String> union = sc.emptyRDD();
+        for (JavaRDD<String> item : inputRdd) {
+            union = union.union(item);
+        }
+        switch (entity) {
+            case "dataset":
+                union.mapToPair((PairFunction<String, String, DLIDataset>) f -> {
+                    final String id = getJPathString(IDJSONPATH, f);
+                    ObjectMapper mapper = new ObjectMapper();
+                    mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
+                    return new Tuple2<>(id, mapper.readValue(f, DLIDataset.class));
+                }).reduceByKey((a, b) -> {
+                    a.mergeFrom(b);
+                    return a;
+                }).map(item -> {
+                    ObjectMapper mapper = new ObjectMapper();
+                    return mapper.writeValueAsString(item._2());
+                }).saveAsTextFile(targetPath, GzipCodec.class);
+                break;
+            case "publication":
+                union.mapToPair((PairFunction<String, String, DLIPublication>) f -> {
+                    final String id = getJPathString(IDJSONPATH, f);
+                    ObjectMapper mapper = new ObjectMapper();
+                    mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
+                    return new Tuple2<>(id, mapper.readValue(f, DLIPublication.class));
+                }).reduceByKey((a, b) -> {
+                    a.mergeFrom(b);
+                    return a;
+                }).map(item -> {
+                    ObjectMapper mapper = new ObjectMapper();
+                    return mapper.writeValueAsString(item._2());
+                }).saveAsTextFile(targetPath, GzipCodec.class);
+                break;
+            case "unknown":
+                union.mapToPair((PairFunction<String, String, DLIUnknown>) f -> {
+                    final String id = getJPathString(IDJSONPATH, f);
+                    ObjectMapper mapper = new ObjectMapper();
+                    mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
+                    return new Tuple2<>(id, mapper.readValue(f, DLIUnknown.class));
+                }).reduceByKey((a, b) -> {
+                    a.mergeFrom(b);
+                    return a;
+                }).map(item -> {
+                    ObjectMapper mapper = new ObjectMapper();
+                    return mapper.writeValueAsString(item._2());
+                }).saveAsTextFile(targetPath, GzipCodec.class);
+                break;
+            case "relation":
+
+                SparkScholexplorerGenerateSimRel.generateDataFrame(spark, sc, inputPath.replace("/relation",""),targetPath.replace("/relation","") );
+                RDD<Relation> rdd = union.mapToPair((PairFunction<String, String, Relation>) f -> {
+                    final String source = getJPathString(SOURCEJSONPATH, f);
+                    final String target = getJPathString(TARGETJSONPATH, f);
+                    final String reltype = getJPathString(RELJSONPATH, f);
+                    ObjectMapper mapper = new ObjectMapper();
+                    mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
+                    return new Tuple2<>(DHPUtils.md5(String.format("%s::%s::%s", source.toLowerCase(), reltype.toLowerCase(), target.toLowerCase())), mapper.readValue(f, Relation.class));
+                }).reduceByKey((a, b) -> {
+                    a.mergeFrom(b);
+                    return a;
+                }).map(Tuple2::_2).rdd();
+
+                spark.createDataset(rdd, Encoders.bean(Relation.class)).write().mode(SaveMode.Overwrite).save(targetPath);
+                Dataset<Relation> rel_ds =spark.read().load(targetPath).as(Encoders.bean(Relation.class));
+
+                System.out.println("LOADING PATH :"+targetPath.replace("/relation","")+"/pid_simRel");
+                Dataset<Relation>sim_ds  =spark.read().load(targetPath.replace("/relation","")+"/pid_simRel").as(Encoders.bean(Relation.class));
+
+                TargetFunction tf = new TargetFunction();
+
+                Dataset<Relation> ids = sim_ds.map(tf, Encoders.bean(Relation.class));
+
+
+                final Dataset<Relation> firstJoin = rel_ds
+                        .joinWith(ids, ids.col("target")
+                                .equalTo(rel_ds.col("source")), "left_outer")
+                        .map((MapFunction<Tuple2<Relation, Relation>, Relation>) s ->
+                                {
+                                    if (s._2() != null) {
+                                        s._1().setSource(s._2().getSource());
+                                    }
+                                    return s._1();
+                                }
+                                , Encoders.bean(Relation.class));
+
+
+                Dataset<Relation> secondJoin = firstJoin.joinWith(ids, ids.col("target").equalTo(firstJoin.col("target")),"left_outer")
+                        .map((MapFunction<Tuple2<Relation, Relation>, Relation>) s ->
+                                {
+                                    if (s._2() != null) {
+                                        s._1().setTarget(s._2().getSource());
+                                    }
+                                    return s._1();
+                                }
+                                , Encoders.bean(Relation.class));
+                    secondJoin.write().mode(SaveMode.Overwrite).save(targetPath+"_fixed");
+
+
+                FileSystem fileSystem = FileSystem.get(sc.hadoopConfiguration());
+
+
+                fileSystem.delete(new Path(targetPath), true);
+                fileSystem.rename(new Path(targetPath+"_fixed"),new Path(targetPath));
+
+        }
+    }
+
+    public static String getJPathString(final String jsonPath, final String json) {
+        try {
+            Object o = JsonPath.read(json, jsonPath);
+            if (o instanceof String)
+                return (String) o;
+            if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
+                return (String) ((JSONArray) o).get(0);
+            return "";
+        } catch (Exception e) {
+            return "";
+        }
+    }
+}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/TargetFunction.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/TargetFunction.java
@ -0,0 +1,15 @@
+package eu.dnetlib.dhp.graph.scholexplorer;
+
+
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.spark.api.java.function.MapFunction;
+
+public class TargetFunction implements MapFunction<Relation, Relation> {
+    @Override
+    public Relation call(Relation relation) throws Exception {
+        final String type = StringUtils.substringBefore(relation.getSource(), "|");
+        relation.setTarget(String.format("%s|%s", type, StringUtils.substringAfter(relation.getTarget(),"::")));
+        return relation;
+    }
+}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/parser/AbstractScholexplorerParser.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/parser/AbstractScholexplorerParser.java
@ -0,0 +1,113 @@
+package eu.dnetlib.dhp.graph.scholexplorer.parser;
+
+
+import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.Qualifier;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.dhp.utils.DHPUtils;
+import eu.dnetlib.scholexplorer.relation.RelationMapper;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import javax.xml.stream.XMLStreamReader;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public abstract class AbstractScholexplorerParser {
+
+    protected static final Log log = LogFactory.getLog(AbstractScholexplorerParser.class);
+    final static Pattern pattern = Pattern.compile("10\\.\\d{4,9}/[-._;()/:A-Z0-9]+$", Pattern.CASE_INSENSITIVE);
+    private List<String> datasetSubTypes = Arrays.asList("dataset", "software", "film", "sound", "physicalobject", "audiovisual", "collection", "other", "study", "metadata");
+
+    public abstract List<Oaf> parseObject(final String record, final RelationMapper relMapper);
+
+    protected Map<String, String> getAttributes(final XMLStreamReader parser) {
+        final Map<String, String> attributesMap = new HashMap<>();
+        for (int i = 0; i < parser.getAttributeCount(); i++) {
+            attributesMap.put(parser.getAttributeLocalName(i), parser.getAttributeValue(i));
+        }
+        return attributesMap;
+    }
+
+
+    protected List<StructuredProperty> extractSubject(List<VtdUtilityParser.Node> subjects) {
+        final List<StructuredProperty> subjectResult = new ArrayList<>();
+        if (subjects != null && subjects.size() > 0) {
+            subjects.forEach(subjectMap -> {
+                final StructuredProperty subject = new StructuredProperty();
+                subject.setValue(subjectMap.getTextValue());
+                final Qualifier schema = new Qualifier();
+                schema.setClassid("dnet:subject");
+                schema.setClassname("dnet:subject");
+                schema.setSchemeid(subjectMap.getAttributes().get("subjectScheme"));
+                schema.setSchemename(subjectMap.getAttributes().get("subjectScheme"));
+                subject.setQualifier(schema);
+                subjectResult.add(subject);
+            });
+        }
+        return subjectResult;
+    }
+
+
+    protected StructuredProperty extractIdentifier(List<VtdUtilityParser.Node> identifierType, final String fieldName) {
+        final StructuredProperty pid = new StructuredProperty();
+        if (identifierType != null && identifierType.size() > 0) {
+            final VtdUtilityParser.Node result = identifierType.get(0);
+            pid.setValue(result.getTextValue());
+            final Qualifier pidType = new Qualifier();
+            pidType.setClassname(result.getAttributes().get(fieldName));
+            pidType.setClassid(result.getAttributes().get(fieldName));
+            pidType.setSchemename("dnet:pid_types");
+            pidType.setSchemeid("dnet:pid_types");
+            pid.setQualifier(pidType);
+            return pid;
+        }
+        return null;
+    }
+
+    protected void inferPid(final StructuredProperty input) {
+        final Matcher matcher = pattern.matcher(input.getValue());
+        if (matcher.find()) {
+            input.setValue(matcher.group());
+            if (input.getQualifier() == null) {
+                input.setQualifier(new Qualifier());
+                input.getQualifier().setSchemename("dnet:pid_types");
+                input.getQualifier().setSchemeid("dnet:pid_types");
+            }
+            input.getQualifier().setClassid("doi");
+            input.getQualifier().setClassname("doi");
+        }
+    }
+
+    protected String generateId(final String pid, final String pidType, final String entityType) {
+        String type;
+        switch (entityType){
+            case "publication":
+                type = "50|";
+                break;
+            case "dataset":
+                type = "60|";
+                break;
+            case "unknown":
+                type = "70|";
+                break;
+            default:
+                throw new IllegalArgumentException("unexpected value "+entityType);
+
+        }
+        if ("dnet".equalsIgnoreCase(pidType))
+            return type+StringUtils.substringAfter(pid, "::");
+
+        return type+ DHPUtils.md5(String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim()));
+    }
+
+
+
+
+}
+
+
+
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/parser/DatasetScholexplorerParser.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/parser/DatasetScholexplorerParser.java
@ -0,0 +1,289 @@
+package eu.dnetlib.dhp.graph.scholexplorer.parser;
+
+import com.ximpleware.AutoPilot;
+import com.ximpleware.VTDGen;
+import com.ximpleware.VTDNav;
+import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
+import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
+import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
+import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo;
+
+import eu.dnetlib.dhp.parser.utility.VtdUtilityParser.Node;
+import eu.dnetlib.scholexplorer.relation.RelInfo;
+import eu.dnetlib.scholexplorer.relation.RelationMapper;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.yarn.webapp.hamlet.Hamlet;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.Collectors;
+
+public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
+    @Override
+    public List<Oaf> parseObject(String record, final RelationMapper relationMapper) {
+        try {
+            final DLIDataset parsedObject = new DLIDataset();
+            final VTDGen vg = new VTDGen();
+            vg.setDoc(record.getBytes());
+            final List<Oaf> result = new ArrayList<>();
+            vg.parse(true);
+
+            final VTDNav vn = vg.getNav();
+            final AutoPilot ap = new AutoPilot(vn);
+
+            DataInfo di = new DataInfo();
+            di.setTrust("0.9");
+            di.setDeletedbyinference(false);
+            di.setInvisible(false);
+            parsedObject.setDataInfo(di);
+
+            parsedObject.setOriginalId(Collections.singletonList(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']")));
+
+            parsedObject.setOriginalObjIdentifier(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']"));
+            parsedObject.setDateofcollection(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']"));
+
+            final String resolvedDate = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']");
+
+            if (StringUtils.isNotBlank(resolvedDate)) {
+                StructuredProperty currentDate = new StructuredProperty();
+                currentDate.setValue(resolvedDate);
+                final Qualifier dateQualifier = new Qualifier();
+                dateQualifier.setClassname("resolvedDate");
+                dateQualifier.setClassid("resolvedDate");
+                dateQualifier.setSchemename("dnet::date");
+                dateQualifier.setSchemeid("dnet::date");
+                currentDate.setQualifier(dateQualifier);
+                parsedObject.setRelevantdate(Collections.singletonList(currentDate));
+            }
+
+            final String completionStatus = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']");
+            final String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']");
+
+            final String publisher = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resource']/*[local-name()='publisher']");
+
+            List<VtdUtilityParser.Node> collectedFromNodes =
+                    VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='collectedFrom']", Arrays.asList("name", "id", "mode", "completionStatus"));
+
+            List<VtdUtilityParser.Node> resolvededFromNodes =
+                    VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='resolvedFrom']", Arrays.asList("name", "id", "mode", "completionStatus"));
+
+            Field<String> pf = new Field<>();
+            pf.setValue(publisher);
+
+            parsedObject.setPublisher(pf);
+            final List<ProvenaceInfo> provenances = new ArrayList<>();
+            if (collectedFromNodes != null && collectedFromNodes.size() > 0) {
+                collectedFromNodes.forEach(it -> {
+                    final ProvenaceInfo provenance = new ProvenaceInfo();
+                    provenance.setId(it.getAttributes().get("id"));
+                    provenance.setName(it.getAttributes().get("name"));
+                    provenance.setCollectionMode(provisionMode);
+                    provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
+                    provenances.add(provenance);
+                });
+            }
+
+            if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) {
+                resolvededFromNodes.forEach(it -> {
+                    final ProvenaceInfo provenance = new ProvenaceInfo();
+                    provenance.setId(it.getAttributes().get("id"));
+                    provenance.setName(it.getAttributes().get("name"));
+                    provenance.setCollectionMode("resolved");
+                    provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
+                    provenances.add(provenance);
+                });
+            }
+
+            parsedObject.setDlicollectedfrom(provenances);
+            parsedObject.setCollectedfrom(parsedObject.getDlicollectedfrom().stream().map(
+                    p-> {
+                        final KeyValue cf = new KeyValue();
+                        cf.setKey(p.getId());
+                        cf.setValue(p.getName());
+                        return cf;
+                    }
+            ).collect(Collectors.toList()));
+            parsedObject.setCompletionStatus(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']"));
+
+            final List<Node> identifierType =
+                    VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='resource']/*[local-name()='identifier']", Collections.singletonList("identifierType"));
+
+            StructuredProperty currentPid = extractIdentifier(identifierType, "identifierType");
+            if (currentPid == null) return null;
+            inferPid(currentPid);
+            parsedObject.setPid(Collections.singletonList(currentPid));
+
+
+            final String sourceId = generateId(currentPid.getValue(), currentPid.getQualifier().getClassid(), "dataset");
+            parsedObject.setId(sourceId);
+
+
+            List<String> descs = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='description']");
+            if (descs != null && descs.size() > 0)
+                parsedObject.setDescription(descs.stream()
+                        .map(it -> it.length() < 512 ? it : it.substring(0, 512))
+                        .map(it -> {
+                            final Field<String> d = new Field<>();
+                            d.setValue(it);
+                            return d;
+                        })
+                        .collect(Collectors.toList()));
+
+
+            final List<Node> relatedIdentifiers =
+                    VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='relatedIdentifier']",
+                            Arrays.asList("relatedIdentifierType", "relationType", "entityType", "inverseRelationType"));
+
+
+            if(relatedIdentifiers!= null) {
+                result.addAll(relatedIdentifiers.stream()
+                        .flatMap(n -> {
+                            final List<Relation> rels = new ArrayList<>();
+                            Relation r = new Relation();
+                            r.setSource(parsedObject.getId());
+                            final String relatedPid = n.getTextValue();
+                            final String relatedPidType = n.getAttributes().get("relatedIdentifierType");
+                            final String relatedType = n.getAttributes().getOrDefault("entityType", "unknown");
+                            String relationSemantic = n.getAttributes().get("relationType");
+                            String inverseRelation = n.getAttributes().get("inverseRelationType");
+                            final String targetId = generateId(relatedPid, relatedPidType, relatedType);
+
+                            if (relationMapper.containsKey(relationSemantic.toLowerCase()))
+                            {
+                                RelInfo relInfo = relationMapper.get(relationSemantic.toLowerCase());
+                                relationSemantic = relInfo.getOriginal();
+                                inverseRelation = relInfo.getInverse();
+                            }
+                            else {
+                                relationSemantic = "Unknown";
+                                inverseRelation = "Unknown";
+                            }
+                            r.setTarget(targetId);
+                            r.setRelType(relationSemantic);
+                            r.setRelClass("datacite");
+                            r.setCollectedFrom(parsedObject.getCollectedfrom());
+                            r.setDataInfo(di);
+                            rels.add(r);
+                            r = new Relation();
+                            r.setDataInfo(di);
+                            r.setSource(targetId);
+                            r.setTarget(parsedObject.getId());
+                            r.setRelType(inverseRelation);
+                            r.setRelClass("datacite");
+                            r.setCollectedFrom(parsedObject.getCollectedfrom());
+                            rels.add(r);
+                            if("unknown".equalsIgnoreCase(relatedType))
+                                result.add(createUnknownObject(relatedPid, relatedPidType, parsedObject.getCollectedfrom().get(0), di));
+                            return rels.stream();
+                        }).collect(Collectors.toList()));
+            }
+
+
+            final List<Node> hostedBy =
+                    VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name"));
+
+
+            if (hostedBy != null) {
+                parsedObject.setInstance(hostedBy.stream().map(it ->
+                {
+                    final Instance i = new Instance();
+                    i.setUrl(Collections.singletonList(currentPid.getValue()));
+                    KeyValue h = new KeyValue();
+                    i.setHostedby(h);
+                    h.setKey(it.getAttributes().get("id"));
+                    h.setValue(it.getAttributes().get("name"));
+                    return i;
+                }).collect(Collectors.toList()));
+            }
+
+
+            List<StructuredProperty> subjects = extractSubject(VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='resource']//*[local-name()='subject']", Arrays.asList("subjectScheme")));
+
+            parsedObject.setSubject(subjects);
+
+            Qualifier q = new Qualifier();
+            q.setClassname("dataset");
+            q.setClassid("dataset");
+            q.setSchemename("dataset");
+            q.setSchemeid("dataset");
+            parsedObject.setResulttype(q);
+
+            parsedObject.setCompletionStatus(completionStatus);
+
+            final List<String> creators = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='resource']//*[local-name()='creator']/*[local-name()='creatorName']");
+            if (creators != null && creators.size() > 0) {
+                parsedObject.setAuthor(creators
+                        .stream()
+                        .map(a -> {
+                            final Author author = new Author();
+                            author.setFullname(a);
+                            return author;
+                        }).collect(Collectors.toList())
+                );
+            }
+            final List<String> titles = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='resource']//*[local-name()='title']");
+            if (titles != null && titles.size() > 0) {
+                parsedObject.setTitle(titles.stream()
+                        .map(t -> {
+                                    final StructuredProperty st = new StructuredProperty();
+                                    st.setValue(t);
+                                    return st;
+                                }
+                        ).collect(Collectors.toList())
+                );
+            }
+
+            final List<String> dates = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='resource']/*[local-name()='dates']/*[local-name()='date']");
+
+
+            if (dates != null && dates.size() > 0) {
+                parsedObject.setRelevantdate(dates.stream().map(
+                        cd -> {
+                            StructuredProperty date = new StructuredProperty();
+                            date.setValue(cd);
+                            final Qualifier dq = new Qualifier();
+                            dq.setClassname("date");
+                            dq.setClassid("date");
+                            dq.setSchemename("dnet::date");
+                            dq.setSchemeid("dnet::date");
+                            date.setQualifier(dq);
+                            return date;
+                        }
+                ).collect(Collectors.toList()));
+            }
+
+
+
+            result.add(parsedObject);
+            return result;
+        } catch (Throwable e) {
+            log.error("Error on parsing record " + record, e);
+            return null;
+        }
+    }
+
+
+    private DLIUnknown createUnknownObject(final String pid, final String pidType, final KeyValue cf, final DataInfo di) {
+        final DLIUnknown uk = new DLIUnknown();
+        uk.setId(generateId(pid, pidType, "unknown"));
+        ProvenaceInfo pi = new ProvenaceInfo();
+        pi.setId(cf.getKey());
+        pi.setName(cf.getValue());
+        pi.setCompletionStatus("incomplete");
+        uk.setDataInfo(di);
+        uk.setDlicollectedfrom(Collections.singletonList(pi));
+        final StructuredProperty sourcePid = new StructuredProperty();
+        sourcePid.setValue(pid);
+        final Qualifier pt = new Qualifier();
+        pt.setClassname(pidType);
+        pt.setClassid(pidType);
+        pt.setSchemename("dnet:pid_types");
+        pt.setSchemeid("dnet:pid_types");
+        sourcePid.setQualifier(pt);
+        uk.setPid(Collections.singletonList(sourcePid));
+        return uk;
+    }
+}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/parser/PublicationScholexplorerParser.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/scholexplorer/parser/PublicationScholexplorerParser.java
@ -0,0 +1,252 @@
+package eu.dnetlib.dhp.graph.scholexplorer.parser;
+
+import com.ximpleware.AutoPilot;
+import com.ximpleware.VTDGen;
+import com.ximpleware.VTDNav;
+import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
+import eu.dnetlib.dhp.parser.utility.VtdUtilityParser.Node;
+import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
+import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo;
+import eu.dnetlib.scholexplorer.relation.RelInfo;
+import eu.dnetlib.scholexplorer.relation.RelationMapper;
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.Collectors;
+
+public class PublicationScholexplorerParser extends AbstractScholexplorerParser {
+
+    @Override
+    public List<Oaf> parseObject(final String record, final RelationMapper relationMapper) {
+        try {
+            final List<Oaf> result = new ArrayList<>();
+            final DLIPublication parsedObject = new DLIPublication();
+            final VTDGen vg = new VTDGen();
+            vg.setDoc(record.getBytes());
+            vg.parse(true);
+
+
+            final VTDNav vn = vg.getNav();
+            final AutoPilot ap = new AutoPilot(vn);
+
+            final DataInfo di = new DataInfo();
+            di.setTrust("0.9");
+            di.setDeletedbyinference(false);
+            di.setInvisible(false);
+
+            parsedObject.setDateofcollection(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']"));
+
+            final String resolvedDate = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']");
+            parsedObject.setOriginalId(Collections.singletonList(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']")));
+
+            if (StringUtils.isNotBlank(resolvedDate)) {
+                StructuredProperty currentDate = new StructuredProperty();
+                currentDate.setValue(resolvedDate);
+                final Qualifier dateQualifier = new Qualifier();
+                dateQualifier.setClassname("resolvedDate");
+                dateQualifier.setClassid("resolvedDate");
+                dateQualifier.setSchemename("dnet::date");
+                dateQualifier.setSchemeid("dnet::date");
+                currentDate.setQualifier(dateQualifier);
+                parsedObject.setRelevantdate(Collections.singletonList(currentDate));
+            }
+
+
+            final List<Node> pid = VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='pid']", Arrays.asList("type"));
+
+            StructuredProperty currentPid = extractIdentifier(pid, "type");
+            if (currentPid == null) return null;
+            inferPid(currentPid);
+            parsedObject.setPid(Collections.singletonList(currentPid));
+            final String sourceId = generateId(currentPid.getValue(), currentPid.getQualifier().getClassid(), "publication");
+            parsedObject.setId(sourceId);
+
+            parsedObject.setOriginalObjIdentifier(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']"));
+
+            String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']");
+
+            List<Node> collectedFromNodes =
+                    VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='collectedFrom']", Arrays.asList("name", "id", "mode", "completionStatus"));
+
+            List<Node> resolvededFromNodes =
+                    VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='resolvedFrom']", Arrays.asList("name", "id", "mode", "completionStatus"));
+
+            final String publisher = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='publisher']");
+            Field<String> pf = new Field<>();
+            pf.setValue(publisher);
+
+            parsedObject.setPublisher(pf);
+            final List<ProvenaceInfo> provenances = new ArrayList<>();
+            if (collectedFromNodes != null && collectedFromNodes.size() > 0) {
+                collectedFromNodes.forEach(it -> {
+                    final ProvenaceInfo provenance = new ProvenaceInfo();
+                    provenance.setId(it.getAttributes().get("id"));
+                    provenance.setName(it.getAttributes().get("name"));
+                    provenance.setCollectionMode(provisionMode);
+                    provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
+                    provenances.add(provenance);
+                });
+            }
+
+            if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) {
+                resolvededFromNodes.forEach(it -> {
+                    final ProvenaceInfo provenance = new ProvenaceInfo();
+                    provenance.setId(it.getAttributes().get("id"));
+                    provenance.setName(it.getAttributes().get("name"));
+                    provenance.setCollectionMode("resolved");
+                    provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
+                    provenances.add(provenance);
+                });
+            }
+
+            parsedObject.setDlicollectedfrom(provenances);
+            parsedObject.setCompletionStatus(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']"));
+
+            parsedObject.setCollectedfrom(parsedObject.getDlicollectedfrom().stream().map(
+                    p -> {
+                        final KeyValue cf = new KeyValue();
+                        cf.setKey(p.getId());
+                        cf.setValue(p.getName());
+                        return cf;
+                    }
+            ).collect(Collectors.toList()));
+
+            final List<Node> relatedIdentifiers =
+                    VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='relatedIdentifier']",
+                            Arrays.asList("relatedIdentifierType", "relationType", "entityType", "inverseRelationType"));
+
+
+            if (relatedIdentifiers != null) {
+                result.addAll(relatedIdentifiers.stream()
+                        .flatMap(n -> {
+                            final List<Relation> rels = new ArrayList<>();
+                            Relation r = new Relation();
+                            r.setSource(parsedObject.getId());
+                            final String relatedPid = n.getTextValue();
+                            final String relatedPidType = n.getAttributes().get("relatedIdentifierType");
+                            final String relatedType = n.getAttributes().getOrDefault("entityType", "unknown");
+                            String relationSemantic = n.getAttributes().get("relationType");
+                            String inverseRelation = "Unknown";
+                            final String targetId = generateId(relatedPid, relatedPidType, relatedType);
+
+                            if (relationMapper.containsKey(relationSemantic.toLowerCase()))
+                            {
+                                RelInfo relInfo = relationMapper.get(relationSemantic.toLowerCase());
+                                relationSemantic = relInfo.getOriginal();
+                                inverseRelation = relInfo.getInverse();
+                            }
+                            else {
+                                relationSemantic = "Unknown";
+                            }
+                            r.setTarget(targetId);
+                            r.setRelType(relationSemantic);
+                            r.setCollectedFrom(parsedObject.getCollectedfrom());
+                            r.setRelClass("datacite");
+                            r.setDataInfo(di);
+                            rels.add(r);
+                            r = new Relation();
+                            r.setDataInfo(di);
+                            r.setSource(targetId);
+                            r.setTarget(parsedObject.getId());
+                            r.setRelType(inverseRelation);
+                            r.setRelClass("datacite");
+                            r.setCollectedFrom(parsedObject.getCollectedfrom());
+                            rels.add(r);
+
+                            return rels.stream();
+                        }).collect(Collectors.toList()));
+            }
+
+            final List<Node> hostedBy =
+                    VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name"));
+
+
+            if (hostedBy != null) {
+                parsedObject.setInstance(hostedBy.stream().map(it ->
+                {
+                    final Instance i = new Instance();
+                    i.setUrl(Collections.singletonList(currentPid.getValue()));
+                    KeyValue h = new KeyValue();
+                    i.setHostedby(h);
+                    h.setKey(it.getAttributes().get("id"));
+                    h.setValue(it.getAttributes().get("name"));
+                    return i;
+                }).collect(Collectors.toList()));
+            }
+
+            final List<String> authorsNode = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='creator']");
+            if (authorsNode != null)
+                parsedObject.setAuthor(authorsNode
+                        .stream()
+                        .map(a -> {
+                            final Author author = new Author();
+                            author.setFullname(a);
+                            return author;
+                        }).collect(Collectors.toList())
+                );
+
+            final List<String> titles = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='title']");
+            if (titles != null) {
+                parsedObject.setTitle(titles.stream()
+                        .map(t -> {
+                                    final StructuredProperty st = new StructuredProperty();
+                                    st.setValue(t);
+                                    return st;
+                                }
+                        ).collect(Collectors.toList())
+                );
+            }
+
+
+            Field<String> description = new Field<>();
+
+            description.setValue(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='description']"));
+
+            if (StringUtils.isNotBlank(description.getValue()) && description.getValue().length() > 512) {
+                description.setValue(description.getValue().substring(0, 512));
+            }
+
+            parsedObject.setDescription(Collections.singletonList(description));
+
+
+            final String cd = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='date']");
+
+            StructuredProperty date = new StructuredProperty();
+            date.setValue(cd);
+            final Qualifier dq = new Qualifier();
+            dq.setClassname("date");
+            dq.setClassid("date");
+            dq.setSchemename("dnet::date");
+            dq.setSchemeid("dnet::date");
+            date.setQualifier(dq);
+            parsedObject.setRelevantdate(Collections.singletonList(date));
+
+            List<StructuredProperty> subjects = extractSubject(VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='subject']", Collections.singletonList("scheme")));
+            parsedObject.setSubject(subjects);
+
+            parsedObject.setDataInfo(di);
+
+            parsedObject.setSubject(subjects);
+            Qualifier q = new Qualifier();
+            q.setClassname("publication");
+            q.setClassid("publication");
+            q.setSchemename("publication");
+            q.setSchemeid("publication");
+            parsedObject.setResulttype(q);
+            result.add(parsedObject);
+            return result;
+
+        } catch (Throwable e) {
+            log.error("Input record: " + record);
+            log.error("Error on parsing record ", e);
+            return null;
+        }
+
+    }
+
+
+}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/GraphMappingUtils.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/GraphMappingUtils.java
@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.graph;
+package eu.dnetlib.dhp.oa.graph;

 import java.util.Map;

--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/SparkGraphImporterJob.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/SparkGraphImporterJob.java
@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.graph;
+package eu.dnetlib.dhp.oa.graph;

 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
@ -15,32 +15,41 @@ public class SparkGraphImporterJob {

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
                IOUtils.toString(SparkGraphImporterJob.class.getResourceAsStream(
-                        "/eu/dnetlib/dhp/graph/input_graph_parameters.json")));
+                        "/eu/dnetlib/dhp/oa/graph/input_graph_parameters.json")));
        parser.parseArgument(args);

+        new SparkGraphImporterJob().run(parser);
+    }
+
+    private void run(ArgumentApplicationParser parser) {
        try(SparkSession spark = getSparkSession(parser)) {

-            final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
            final String inputPath = parser.get("sourcePath");
            final String hiveDbName = parser.get("hive_db_name");

-            spark.sql(String.format("DROP DATABASE IF EXISTS %s CASCADE", hiveDbName));
-            spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", hiveDbName));
-
-            // Read the input file and convert it into RDD of serializable object
-            GraphMappingUtils.types.forEach((name, clazz) -> spark.createDataset(sc.textFile(inputPath + "/" + name)
-                    .map(s -> new ObjectMapper().readValue(s, clazz))
-                    .rdd(), Encoders.bean(clazz))
-                    .write()
-                    .mode(SaveMode.Overwrite)
-                    .saveAsTable(hiveDbName + "." + name));
+            runWith(spark, inputPath, hiveDbName);
        }
    }

+    // public for testing
+    public void runWith(SparkSession spark, String inputPath, String hiveDbName) {
+
+        spark.sql(String.format("DROP DATABASE IF EXISTS %s CASCADE", hiveDbName));
+        spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", hiveDbName));
+
+        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+        // Read the input file and convert it into RDD of serializable object
+        GraphMappingUtils.types.forEach((name, clazz) -> spark.createDataset(sc.textFile(inputPath + "/" + name)
+                .map(s -> new ObjectMapper().readValue(s, clazz))
+                .rdd(), Encoders.bean(clazz))
+                .write()
+                .mode(SaveMode.Overwrite)
+                .saveAsTable(hiveDbName + "." + name));
+    }
+
    private static SparkSession getSparkSession(ArgumentApplicationParser parser) {
        SparkConf conf = new SparkConf();
        conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
-
        return SparkSession
                .builder()
                .appName(SparkGraphImporterJob.class.getSimpleName())
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/convertXmlToEntities/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/convertXmlToEntities/oozie_app/config-default.xml
@ -0,0 +1,10 @@
+<configuration>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+</configuration>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/convertXmlToEntities/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/convertXmlToEntities/oozie_app/workflow.xml
@ -0,0 +1,90 @@
+<workflow-app name="import_infospace_graph" xmlns="uri:oozie:workflow:0.5">
+
+    <parameters>
+        <property>
+            <name>sourcePath</name>
+            <description>the source path</description>
+        </property>
+        <property>
+            <name>hive_db_name</name>
+            <description>the target hive database name</description>
+        </property>
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCores</name>
+            <description>number of cores used by single executor</description>
+        </property>
+    </parameters>
+
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <property>
+                <name>mapreduce.job.queuename</name>
+                <value>${queueName}</value>
+            </property>
+            <property>
+                <name>oozie.launcher.mapred.job.queue.name</name>
+                <value>${oozieLauncherQueueName}</value>
+            </property>
+        </configuration>
+    </global>
+
+    <start to="MapGraphAsHiveDB"/>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <action name="MapGraphAsHiveDB">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>MapGraphAsHiveDB</name>
+            <class>eu.dnetlib.dhp.oa.graph.SparkGraphImporterJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory ${sparkExecutorMemory}
+                --executor-cores ${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
+                --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
+                --conf spark.sql.warehouse.dir="/user/hive/warehouse"
+            </spark-opts>
+            <arg>-mt</arg> <arg>yarn</arg>
+            <arg>-s</arg><arg>${sourcePath}</arg>
+            <arg>-db</arg><arg>${hive_db_name}</arg>
+            <arg>-h</arg><arg>${hive_metastore_uris}</arg>
+        </spark>
+        <ok to="PostProcessing"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="PostProcessing">
+        <hive2 xmlns="uri:oozie:hive2-action:0.1">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <configuration>
+                <property>
+                    <name>hive.metastore.uris</name>
+                    <value>${hive_metastore_uris}</value>
+                </property>
+            </configuration>
+            <jdbc-url>${hive_jdbc_url}/${hive_db_name}</jdbc-url>
+            <script>lib/scripts/postprocessing.sql</script>
+            <param>hive_db_name=${hive_db_name}</param>
+        </hive2>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/extractEntities/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/extractEntities/oozie_app/config-default.xml
@ -0,0 +1,10 @@
+<configuration>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+</configuration>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/extractEntities/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/extractEntities/oozie_app/workflow.xml
@ -0,0 +1,75 @@
+<workflow-app name="import_infospace_graph" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>sourcePath</name>
+            <description>the source path</description>
+        </property>
+        <property>
+            <name>targetPath</name>
+            <description>the source path</description>
+        </property>
+        <property>
+            <name>targetDir</name>
+            <description>the name of the  path</description>
+        </property>
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>entities</name>
+            <description>the entities to be extracted</description>
+        </property>
+    </parameters>
+
+    <start to="DeleteTargetPath"/>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+    <action name="DeleteTargetPath">
+        <fs>
+            <mkdir path="${targetPath}"/>  
+            <mkdir path="${targetPath}/dataset"/>
+            <mkdir path="${targetPath}/publication"/>
+            <mkdir path="${targetPath}/unknown"/>
+            <mkdir path="${targetPath}/relation"/>            
+            <delete path='${targetPath}/dataset/${targetDir}'/>
+            <delete path='${targetPath}/publication/${targetDir}'/>
+            <delete path='${targetPath}/unknown/${targetDir}'/>
+            <delete path='${targetPath}/relation/${targetDir}'/>
+        </fs>
+        <ok to="ExtractDLIEntities"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="ExtractDLIEntities">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Extract ${entities}</name>
+            <class>eu.dnetlib.dhp.graph.scholexplorer.SparkExtractEntitiesJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory ${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                ${sparkExtraOPT}
+            </spark-opts>
+            <arg>-mt</arg> <arg>yarn-cluster</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+            <arg>--targetPath</arg><arg>${targetPath}</arg>
+            <arg>--targetDir</arg><arg>${targetDir}</arg>
+            <arg>--entities</arg><arg>${entities}</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/generate_sim_rel_scholix_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/generate_sim_rel_scholix_parameters.json
@ -0,0 +1,5 @@
+[
+  {"paramName":"mt",  "paramLongName":"master",             "paramDescription": "should be local or yarn",                                  "paramRequired": true},
+  {"paramName":"s",   "paramLongName":"sourcePath",         "paramDescription": "the path of the sequencial file to read",                  "paramRequired": true},
+  {"paramName":"t",   "paramLongName":"targetPath",         "paramDescription": "the path of the result data",                              "paramRequired": true}
+]
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/importMongoDbToHdfs/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/importMongoDbToHdfs/oozie_app/config-default.xml
@ -0,0 +1,10 @@
+<configuration>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+</configuration>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/importMongoDbToHdfs/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/importMongoDbToHdfs/oozie_app/workflow.xml
@ -0,0 +1,73 @@
+<workflow-app name="import Entities from aggretor to HDFS" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>workingPath</name>
+            <description>the working dir base path</description>
+        </property>
+        <property>
+            <name>targetPath</name>
+            <description>the graph Raw base path</description>
+        </property>
+        <property>
+            <name>format</name>
+            <description>the postgres URL to access to the database</description>
+        </property>
+        <property>
+            <name>layout</name>
+            <description>the user postgres</description>
+        </property>
+        <property>
+            <name>interpretation</name>
+            <description>the password postgres</description>
+        </property>
+        <property>
+            <name>dbhost</name>
+            <description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
+        </property>
+        <property>
+            <name>dbName</name>
+            <description>mongo database</description>
+        </property>
+        <property>
+            <name>user</name>
+            <description>HDFS user</description>
+        </property>
+    </parameters>
+
+    <start to="ResetWorkingPath"/>
+
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <action name="ResetWorkingPath">
+        <fs>
+            <delete path='${targetPath}'/>
+            <mkdir path='${workingPath}'/>
+        </fs>
+        <ok to="ImportEntitiesFromMongo"/>
+        <error to="Kill"/>
+    </action>
+
+
+    <action name="ImportEntitiesFromMongo">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.dhp.graph.scholexplorer.ImportDataFromMongo</main-class>
+            <arg>-t</arg><arg>${targetPath}</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-u</arg><arg>${user}</arg>
+            <arg>-h</arg><arg>${dbhost}</arg>
+            <arg>-p</arg><arg>27017</arg>
+            <arg>-dn</arg><arg>${dbName}</arg>
+            <arg>-f</arg><arg>${format}</arg>
+            <arg>-l</arg><arg>${layout}</arg>
+            <arg>-i</arg><arg>${interpretation}</arg>
+        </java>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+    <end name="End"/>
+</workflow-app>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/import_from_mongo_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/import_from_mongo_parameters.json
@ -0,0 +1,12 @@
+[
+  {"paramName":"n",  "paramLongName":"namenode",          "paramDescription": "the name node",                         "paramRequired": true},
+  {"paramName":"u",  "paramLongName":"user",          "paramDescription": "the name node",                         "paramRequired": true},
+  {"paramName":"t",  "paramLongName":"targetPath",          "paramDescription": "the name node",                         "paramRequired": true},
+  {"paramName":"h",  "paramLongName":"dbhost",            "paramDescription": "the mongo host",                         "paramRequired": true},
+  {"paramName":"p",   "paramLongName":"dbport",           "paramDescription": "the mongo port",                         "paramRequired": true},
+  {"paramName":"f",   "paramLongName":"format",           "paramDescription": "the metadata format to import",          "paramRequired": true},
+  {"paramName":"l",   "paramLongName":"layout",           "paramDescription": "the metadata layout to import",          "paramRequired": true},
+  {"paramName":"i",   "paramLongName":"interpretation",   "paramDescription": "the metadata interpretation to import",  "paramRequired": true},
+  {"paramName":"dn",   "paramLongName":"dbName",          "paramDescription": "the database Name",                      "paramRequired": true}
+
+]
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/input_extract_entities_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/input_extract_entities_parameters.json
@ -0,0 +1,7 @@
+[
+  {"paramName":"mt",  "paramLongName":"master",               "paramDescription": "should be local or yarn",                                  "paramRequired": true},
+  {"paramName":"s",   "paramLongName":"sourcePath",           "paramDescription": "the path of the sequencial file to read",                  "paramRequired": true},
+  {"paramName":"t",   "paramLongName":"targetPath",           "paramDescription": "the path of the result data",                              "paramRequired": true},
+  {"paramName":"td",   "paramLongName":"targetDir",         "paramDescription": "the name of the result data",                              "paramRequired": true},
+  {"paramName":"e",   "paramLongName":"entities",             "paramDescription": "the entity type to be filtered",                           "paramRequired": true}
+]
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/input_graph_scholix_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/input_graph_scholix_parameters.json
@ -1,6 +1,6 @@
 [
  {"paramName":"mt",  "paramLongName":"master",             "paramDescription": "should be local or yarn",                                  "paramRequired": true},
  {"paramName":"s",   "paramLongName":"sourcePath",         "paramDescription": "the path of the sequencial file to read",                  "paramRequired": true},
-  {"paramName":"h",   "paramLongName":"hive_metastore_uris","paramDescription": "the hive metastore uris",                                  "paramRequired": true},
-  {"paramName":"db",  "paramLongName":"hive_db_name",       "paramDescription": "the target hive database name",                            "paramRequired": true}
+  {"paramName":"t",   "paramLongName":"targetPath",         "paramDescription": "the path of the result data",                              "paramRequired": true},
+  {"paramName":"e",   "paramLongName":"entity",             "paramDescription": "the entity type",                                          "paramRequired": true}
 ]
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/mergeEntities/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/mergeEntities/oozie_app/config-default.xml
@ -0,0 +1,10 @@
+<configuration>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+</configuration>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/mergeEntities/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/mergeEntities/oozie_app/workflow.xml
@ -0,0 +1,61 @@
+<workflow-app name="Infospace Merge Entities" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>sourcePath</name>
+            <description>the source path</description>
+        </property>
+        <property>
+            <name>targetPath</name>
+            <description>the source path</description>
+        </property>
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>entity</name>
+            <description>the entity to be merged</description>
+        </property>
+    </parameters>
+
+    <start to="DeleteTargetPath"/>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <action name="DeleteTargetPath">
+        <fs>
+            <mkdir path="${targetPath}"/>  
+                      
+            <delete path='${targetPath}/${entity}'/>            
+        </fs>
+        <ok to="MergeDLIEntities"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="MergeDLIEntities">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Merge ${entity}</name>
+            <class>eu.dnetlib.dhp.graph.scholexplorer.SparkScholexplorerMergeEntitiesJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>  --executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT}</spark-opts>
+            <arg>-mt</arg> <arg>yarn-cluster</arg>
+            <arg>--sourcePath</arg><arg>${sourcePath}/${entity}</arg>
+            <arg>--targetPath</arg><arg>${targetPath}/${entity}</arg>
+            <arg>--entity</arg><arg>${entity}</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/merge_entities_scholix_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/merge_entities_scholix_parameters.json
@ -0,0 +1,6 @@
+[
+  {"paramName":"mt",  "paramLongName":"master",             "paramDescription": "should be local or yarn",                                  "paramRequired": true},
+  {"paramName":"s",   "paramLongName":"sourcePath",         "paramDescription": "the path of the sequencial file to read",                  "paramRequired": true},
+  {"paramName":"e",   "paramLongName":"entity",             "paramDescription": "the entity type",                                          "paramRequired": true},
+  {"paramName":"t",   "paramLongName":"targetPath",         "paramDescription": "the path of the result data",                              "paramRequired": true}
+]
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/relations.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/scholexplorer/relations.json
@ -0,0 +1,158 @@
+{
+  "cites":{
+    "original":"Cites",
+    "inverse":"IsCitedBy"
+  },
+  "compiles":{
+    "original":"Compiles",
+    "inverse":"IsCompiledBy"
+  },
+  "continues":{
+    "original":"Continues",
+    "inverse":"IsContinuedBy"
+  },
+  "derives":{
+    "original":"IsSourceOf",
+    "inverse":"IsDerivedFrom"
+  },
+  "describes":{
+    "original":"Describes",
+    "inverse":"IsDescribedBy"
+  },
+  "documents":{
+    "original":"Documents",
+    "inverse":"IsDocumentedBy"
+  },
+  "hasmetadata":{
+    "original":"HasMetadata",
+    "inverse":"IsMetadataOf"
+  },
+  "hasassociationwith":{
+    "original":"HasAssociationWith",
+    "inverse":"HasAssociationWith"
+  },
+  "haspart":{
+    "original":"HasPart",
+    "inverse":"IsPartOf"
+  },
+  "hasversion":{
+    "original":"HasVersion",
+    "inverse":"IsVersionOf"
+  },
+  "iscitedby":{
+    "original":"IsCitedBy",
+    "inverse":"Cites"
+  },
+  "iscompiledby":{
+    "original":"IsCompiledBy",
+    "inverse":"Compiles"
+  },
+  "iscontinuedby":{
+    "original":"IsContinuedBy",
+    "inverse":"Continues"
+  },
+  "isderivedfrom":{
+    "original":"IsDerivedFrom",
+    "inverse":"IsSourceOf"
+  },
+  "isdescribedby":{
+    "original":"IsDescribedBy",
+    "inverse":"Describes"
+  },
+  "isdocumentedby":{
+    "original":"IsDocumentedBy",
+    "inverse":"Documents"
+  },
+  "isidenticalto":{
+    "original":"IsIdenticalTo",
+    "inverse":"IsIdenticalTo"
+  },
+  "ismetadatafor":{
+    "original":"IsMetadataFor",
+    "inverse":"IsMetadataOf"
+  },
+  "ismetadataof":{
+    "original":"IsMetadataOf",
+    "inverse":"IsMetadataFor"
+  },
+  "isnewversionof":{
+    "original":"IsNewVersionOf",
+    "inverse":"IsPreviousVersionOf"
+  },
+  "isobsoletedby":{
+    "original":"IsObsoletedBy",
+    "inverse":"Obsoletes"
+  },
+  "isoriginalformof":{
+    "original":"IsOriginalFormOf",
+    "inverse":"IsVariantFormOf"
+  },
+  "ispartof":{
+    "original":"IsPartOf",
+    "inverse":"HasPart"
+  },
+  "ispreviousversionof":{
+    "original":"IsPreviousVersionOf",
+    "inverse":"IsNewVersionOf"
+  },
+  "isreferencedby":{
+    "original":"IsReferencedBy",
+    "inverse":"References"
+  },
+  "isrelatedto":{
+    "original":"IsRelatedTo",
+    "inverse":"IsRelatedTo"
+  },
+  "isrequiredby":{
+    "original":"IsRequiredBy",
+    "inverse":"Requires"
+  },
+  "isreviewedby":{
+    "original":"IsReviewedBy",
+    "inverse":"Reviews"
+  },
+  "issourceof":{
+    "original":"IsSourceOf",
+    "inverse":"IsDerivedFrom"
+  },
+  "issupplementedby":{
+    "original":"IsSupplementedBy",
+    "inverse":"IsSupplementTo"
+  },
+  "issupplementto":{
+    "original":"IsSupplementTo",
+    "inverse":"IsSupplementedBy"
+  },
+  "isvariantformof":{
+    "original":"IsVariantFormOf",
+    "inverse":"IsOriginalFormOf"
+  },
+  "isversionof":{
+    "original":"IsVersionOf",
+    "inverse":"HasVersion"
+  },
+  "obsoletes":{
+    "original":"Obsoletes",
+    "inverse":"IsObsoletedBy"
+  },
+  "references":{
+    "original":"References",
+    "inverse":"IsReferencedBy"
+  },
+  "requires":{
+    "original":"Requires",
+    "inverse":"IsRequiredBy"
+  },
+  "related":{
+    "original":"IsRelatedTo",
+    "inverse":"IsRelatedTo"
+  },
+  "reviews":{
+    "original":"Reviews",
+    "inverse":"IsReviewedBy"
+  },
+  "unknown":{
+    "original":"Unknown",
+    "inverse":"Unknown"
+  }
+}
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/input_graph_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/input_graph_parameters.json
@ -0,0 +1,6 @@
+[
+  {"paramName":"mt",  "paramLongName":"master",             "paramDescription": "should be local or yarn",                  "paramRequired": true},
+  {"paramName":"s",   "paramLongName":"sourcePath",         "paramDescription": "the path of the sequencial file to read",  "paramRequired": true},
+  {"paramName":"h",   "paramLongName":"hive_metastore_uris","paramDescription": "the hive metastore uris",                  "paramRequired": true},
+  {"paramName":"db",  "paramLongName":"hive_db_name",       "paramDescription": "the target hive database name",            "paramRequired": true}
+]
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/oozie_app/config-default.xml
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/oozie_app/lib/scripts/postprocessing.sql
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/oozie_app/lib/scripts/postprocessing.sql
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/oozie_app/workflow.xml
@ -1,4 +1,4 @@
-<workflow-app name="import_infospace_graph" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="import_graph_as_hive_DB" xmlns="uri:oozie:workflow:0.5">

    <parameters>
        <property>
@ -49,7 +49,7 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>MapGraphAsHiveDB</name>
-            <class>eu.dnetlib.dhp.graph.SparkGraphImporterJob</class>
+            <class>eu.dnetlib.dhp.oa.graph.SparkGraphImporterJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory ${sparkExecutorMemory}
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/SparkGraphImporterJobTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/SparkGraphImporterJobTest.java
@ -1,52 +0,0 @@
-package eu.dnetlib.dhp.graph;
-
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.Encoders;
-import org.apache.spark.sql.SparkSession;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.Disabled;
-import org.junit.jupiter.api.Test;
-import org.junit.jupiter.api.io.TempDir;
-import scala.Tuple2;
-
-import java.nio.file.Path;
-import java.util.List;
-import java.util.stream.Collectors;
-
-public class SparkGraphImporterJobTest {
-
-    private static final long MAX = 1000L;
-
-    @Disabled("must be parametrized to run locally")
-    public void testImport(@TempDir Path outPath) throws Exception {
-        SparkGraphImporterJob.main(new String[] {
-                "-mt", "local[*]",
-                "-s", getClass().getResource("/eu/dnetlib/dhp/graph/sample").getPath(),
-                "-h", "",
-                "-db", "test"
-        });
-
-        countEntities(outPath.toString()).forEach(t -> {
-            System.out.println(t);
-            Assertions.assertEquals(MAX, t._2().longValue(), String.format("mapped %s must be %s", t._1(), MAX));
-        });
-    }
-
-    public static List<Tuple2<String, Long>> countEntities(final String inputPath) {
-
-        final SparkSession spark = SparkSession
-                .builder()
-                .appName(SparkGraphImporterJobTest.class.getSimpleName())
-                .master("local[*]")
-                .getOrCreate();
-        //final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
-
-        return GraphMappingUtils.types.entrySet()
-                .stream()
-                .map(entry -> {
-                    final Long count = spark.read().load(inputPath + "/" + entry.getKey()).as(Encoders.bean(entry.getValue())).count();
-                    return new Tuple2<String, Long>(entry.getKey(), count);
-                })
-                .collect(Collectors.toList());
-    }
-}
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/ScholexplorerParserTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/ScholexplorerParserTest.java
@ -0,0 +1,38 @@
+package eu.dnetlib.dhp.graph.scholexplorer;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.SerializationFeature;
+import eu.dnetlib.dhp.graph.scholexplorer.parser.DatasetScholexplorerParser;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.scholexplorer.relation.RelationMapper;
+import org.apache.commons.io.IOUtils;
+import org.junit.jupiter.api.Test;
+
+import java.util.List;
+
+public class ScholexplorerParserTest {
+
+
+    @Test
+    public void testDataciteParser() throws Exception {
+        String xml = IOUtils.toString(this.getClass().getResourceAsStream("dmf.xml"));
+
+        DatasetScholexplorerParser p = new DatasetScholexplorerParser();
+        List<Oaf> oaves = p.parseObject(xml, RelationMapper.load());
+
+        ObjectMapper m = new ObjectMapper();
+        m.enable(SerializationFeature.INDENT_OUTPUT);
+
+
+        oaves.forEach(oaf -> {
+            try {
+                System.out.println(m.writeValueAsString(oaf));
+                System.out.println("----------------------------");
+            } catch (JsonProcessingException e) {
+
+            }
+        });
+
+    }
+}
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerGraphImporterTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerGraphImporterTest.java
@ -0,0 +1,11 @@
+package eu.dnetlib.dhp.graph.scholexplorer;
+
+
+
+
+public class SparkScholexplorerGraphImporterTest {
+
+
+
+
+}
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerMergeEntitiesJobTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerMergeEntitiesJobTest.java
@ -0,0 +1,8 @@
+package eu.dnetlib.dhp.graph.scholexplorer;
+
+
+
+public class SparkScholexplorerMergeEntitiesJobTest {
+
+
+}
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/SparkGraphImporterJobTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/SparkGraphImporterJobTest.java
@ -0,0 +1,54 @@
+package eu.dnetlib.dhp.oa.graph;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.SparkSession;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.nio.file.Path;
+
+public class SparkGraphImporterJobTest {
+
+    private final static String TEST_DB_NAME = "test";
+
+    @Test
+    public void testImport(@TempDir Path outPath) {
+        try(SparkSession spark = testSparkSession(outPath.toString())) {
+
+            new SparkGraphImporterJob().runWith(
+                    spark,
+                    getClass().getResource("/eu/dnetlib/dhp/graph/sample").getPath(),
+                    TEST_DB_NAME);
+
+            GraphMappingUtils.types.forEach((name, clazz) -> {
+                final long count = spark.read().table(TEST_DB_NAME + "." + name).count();
+                if (name.equals("relation")) {
+                    Assertions.assertEquals(100, count, String.format("%s should be 100", name));
+                } else {
+                    Assertions.assertEquals(10, count, String.format("%s should be 10", name));
+                }
+            });
+        }
+    }
+
+    private SparkSession testSparkSession(final String inputPath) {
+        SparkConf conf = new SparkConf();
+
+        conf.set("spark.driver.host", "localhost");
+        conf.set("hive.metastore.local", "true");
+        conf.set("hive.metastore.warehouse.dir", inputPath + "/warehouse");
+        conf.set("spark.sql.warehouse.dir", inputPath);
+        conf.set("javax.jdo.option.ConnectionURL", String.format("jdbc:derby:;databaseName=%s/junit_metastore_db;create=true", inputPath));
+        conf.set("spark.ui.enabled", "false");
+
+        return SparkSession
+                .builder()
+                .appName(SparkGraphImporterJobTest.class.getSimpleName())
+                .master("local[*]")
+                .config(conf)
+                .enableHiveSupport()
+                .getOrCreate();
+    }
+
+}
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/dataset/dataset_10.json.gz
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/dataset/dataset_10.json.gz
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/scholexplorer/dmf.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/scholexplorer/dmf.xml
@ -0,0 +1,66 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<oai:record xmlns:oai="http://www.openarchives.org/OAI/2.0/"
+            xmlns:oaf="http://namespace.openaire.eu/oaf"
+            xmlns:dri="http://www.driver-repository.eu/namespace/dri"
+            xmlns:dc="http://purl.org/dc/elements/1.1/">
+   <oai:header>
+      <dri:repositoryId>aaadf8b3-01a8-4cc2-9964-63cfb19df3b4_UmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZXMvUmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZVR5cGU=</dri:repositoryId>
+      <dri:recordIdentifier>oai:pangaea.de:doi:10.1594/PANGAEA.821876</dri:recordIdentifier>
+      <dri:datasourceprefix>r3d100010134</dri:datasourceprefix>
+      <dri:objIdentifier>r3d100010134::000083be706192d2d839915694ecfd47</dri:objIdentifier>
+<dri:resolvedDate>2020-01-08T04:12:12.287</dri:resolvedDate>
+      <dri:dateOfCollection>2020-01-08T03:24:10.865Z</dri:dateOfCollection>
+      <oaf:datasourceprefix/>
+      <identifier>oai:pangaea.de:doi:10.1594/PANGAEA.821876</identifier>
+      <setSpec>citable</setSpec>
+   </oai:header>
+   <metadata>
+	<resource xmlns="http://datacite.org/schema/kernel-3" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://datacite.org/schema/kernel-3 http://schema.datacite.org/meta/kernel-3/metadata.xsd">
+		<identifier identifierType="doi">10.1594/pangaea.821876</identifier>
+		<creators> <creator><creatorName>Macke, Andreas</creatorName></creator><creator><creatorName>Kalisch, John</creatorName></creator> </creators>
+		<titles> <title>Total Sky Imager observations during POLARSTERN cruise ANT-XXVI/4 on 2010-05-14 with links to images</title>  </titles>
+
+<publisher>PANGAEA - Data Publisher for Earth &amp; Environmental Science</publisher>
+		<dates>
+			<date dateType="Collected">2010-05-14T00:13:47/2010-05-14T23:55:47</date>
+		</dates>
+		<subjects>
+
+	        <subject subjectScheme="Parameter">DATE/TIME</subject>
+
+	        <subject subjectScheme="Parameter">LATITUDE</subject>
+
+	        <subject subjectScheme="Parameter">LONGITUDE</subject>
+
+	        <subject subjectScheme="Parameter">Uniform resource locator/link to image</subject>
+
+	        <subject subjectScheme="Method">Total Sky Imager</subject>
+
+	        <subject subjectScheme="Campaign">ANT-XXVI/4</subject>
+
+	        <subject subjectScheme="Basis">Polarstern</subject>
+
+		</subjects>
+		<resourceType resourceTypeGeneral="dataset">dataset</resourceType>
+		<relatedIdentifiers>
+
+	            <relatedIdentifier relatedIdentifierType="dnet" relationType="isPartOf" inverseRelationType="hasPart" entityType="dataset">dli_resolver::cf447a378b0b6603593f8b0e57242695</relatedIdentifier>
+
+	            <relatedIdentifier relatedIdentifierType="URL" relationType="references" inverseRelationType="isReferencedBy" entityType="unknown">http://hs.pangaea.de/images/airphoto/ps/ps75/2010-05-14/ant-xxvi_4_2010-05-14_tsi-images-links.zip</relatedIdentifier>
+
+	            <relatedIdentifier relatedIdentifierType="dnet" relationType="references" inverseRelationType="isReferencedBy" entityType="publication">dli_resolver::f0f5975d20991cffd222c6002ddd5821</relatedIdentifier>
+
+	    </relatedIdentifiers>
+	</resource>
+</metadata>
+<oaf:about xmlns:oaf="http://namespace.dnet.eu/oaf">
+	<oaf:datainfo >
+		<oaf:completionStatus>complete</oaf:completionStatus>
+
+  <oaf:collectedFrom id="dli_________::r3d100010134" name="Pangaea" completionStatus="complete"/>
+
+	</oaf:datainfo>
+</oaf:about>
+
+
+</oai:record>
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/scholexplorer/t.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/scholexplorer/t.xml
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/dataset/dataset_10.json.gz
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/dataset/dataset_10.json.gz
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/datasource/datasource_10.json.gz
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/datasource/datasource_10.json.gz
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/organization/organization_10.json.gz
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/organization/organization_10.json.gz
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/otherresearchproduct/otherresearchproduct_10.json.gz
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/otherresearchproduct/otherresearchproduct_10.json.gz
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/project/project_10.json.gz
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/project/project_10.json.gz
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/publication/publication_10.json.gz
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/publication/publication_10.json.gz
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/relation/relation_100.json.gz
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/relation/relation_100.json.gz
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/software/software_10.json.gz
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/sample/software/software_10.json.gz
--- a/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml
@ -0,0 +1,76 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>dhp-workflows</artifactId>
+        <groupId>eu.dnetlib.dhp</groupId>
+        <version>1.1.6-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <artifactId>dhp-graph-provision-scholexplorer</artifactId>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>net.alchim31.maven</groupId>
+                <artifactId>scala-maven-plugin</artifactId>
+                <version>4.0.1</version>
+                <executions>
+                    <execution>
+                        <id>scala-compile-first</id>
+                        <phase>initialize</phase>
+                        <goals>
+                            <goal>add-source</goal>
+                            <goal>compile</goal>
+                        </goals>
+                    </execution>
+                    <execution>
+                        <id>scala-test-compile</id>
+                        <phase>process-test-resources</phase>
+                        <goals>
+                            <goal>testCompile</goal>
+                        </goals>
+                    </execution>
+                </executions>
+                <configuration>
+                    <scalaVersion>${scala.version}</scalaVersion>
+                </configuration>
+            </plugin>
+        </plugins>
+
+    </build>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-core_2.11</artifactId>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-sql_2.11</artifactId>
+        </dependency>
+
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-common</artifactId>
+            <version>${project.version}</version>
+
+        </dependency>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-schemas</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.elasticsearch</groupId>
+            <artifactId>elasticsearch-hadoop</artifactId>
+
+        </dependency>
+
+    </dependencies>
+
+
+</project>
--- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/DatasetJoiner.scala
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/DatasetJoiner.scala
@ -0,0 +1,29 @@
+package eu.dnetlib.dhp.provision
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.functions.{coalesce, col, count, lit}
+
+object DatasetJoiner {
+
+  def startJoin(spark: SparkSession, relPath:String, targetPath:String) {
+    val relation = spark.read.load(relPath)
+
+    val relatedPublication = relation.where("target like '50%'").groupBy("source").agg(count("target").as("publication")).select(col("source"). alias("p_source"), col("publication"))
+    val relatedDataset = relation.where("target like '60%'").groupBy("source").agg(count("target").as("dataset")).select(col("source"). alias("d_source"), col("dataset"))
+    val relatedUnknown = relation.where("target like '70%'").groupBy("source").agg(count("target").as("unknown")).select(col("source"). alias("u_source"), col("unknown"))
+    val firstJoin = relatedPublication
+        .join(relatedDataset,col("p_source").equalTo(col("d_source")),"full")
+      .select(coalesce(col("p_source"), col("d_source")).alias("id"),
+              col("publication"),
+              col("dataset"))
+      .join(relatedUnknown, col("u_source").equalTo(col("id")),"full")
+      .select(coalesce(col("u_source"), col("id")).alias("source"),
+        coalesce(col("publication"),lit(0)).alias("relatedPublication"),
+        coalesce(col("dataset"),lit(0)).alias("relatedDataset"),
+        coalesce(col("unknown"),lit(0)).alias("relatedUnknown")
+      )
+    firstJoin.write.mode("overwrite").save(targetPath)
+
+  }
+
+}
--- a/Show More
+++ b/Show More