first implementation of incremental update of scholix index

2020-03-31 14:41:35 +02:00 · 2020-03-31 14:41:35 +02:00 · cd7416ae4c
parent 62cc257e5c
commit cd7416ae4c
18 changed files with 2883 additions and 52 deletions
--- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java
+++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java
@ -29,7 +29,7 @@ import java.util.List;
 public class SparkCreateConnectedComponent {

    public static void main(String[] args) throws Exception {
-        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json")));
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json")));
        parser.parseArgument(args);
        final SparkSession spark = SparkSession
                .builder()
--- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java
+++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java
@ -11,7 +11,7 @@ import org.apache.spark.sql.SparkSession;

 public class SparkCreateDedupRecord {
    public static void main(String[] args) throws Exception {
-        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json")));
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/sx/dedup/dedupRecord_parameters.json")));
        parser.parseArgument(args);
        final SparkSession spark = SparkSession
                .builder()
--- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java
+++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java
@ -29,7 +29,7 @@ import java.util.List;
 public class SparkCreateSimRels {

    public static void main(String[] args) throws Exception {
-        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json")));
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json")));
        parser.parseArgument(args);
        final SparkSession spark = SparkSession
                .builder()
--- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkPropagateRelationsJob.java
+++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkPropagateRelationsJob.java
@ -23,7 +23,7 @@ public class SparkPropagateRelationsJob {
    final static String TARGETJSONPATH = "$.target";

    public static void main(String[] args) throws Exception {
-        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkPropagateRelationsJob.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_propagate_relation_parameters.json")));
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkPropagateRelationsJob.class.getResourceAsStream("/eu/dnetlib/dhp/sx/dedup/dedup_propagate_relation_parameters.json")));
        parser.parseArgument(args);
        final SparkSession spark = SparkSession
                .builder()
--- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityJob.java
+++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityJob.java
@ -26,7 +26,7 @@ public class SparkUpdateEntityJob {
    final static String IDJSONPATH = "$.id";

    public static void main(String[] args) throws Exception {
-        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkUpdateEntityJob.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_delete_by_inference_parameters.json")));
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkUpdateEntityJob.class.getResourceAsStream("/eu/dnetlib/dhp/sx/dedup/dedup_delete_by_inference_parameters.json")));
        parser.parseArgument(args);
        final SparkSession spark = SparkSession
                .builder()
--- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/dedupRecord_parameters.json
+++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/dedupRecord_parameters.json
@ -0,0 +1,33 @@
+[
+  {
+    "paramName": "mt",
+    "paramLongName": "master",
+    "paramDescription": "should be local or yarn",
+    "paramRequired": true
+  },
+  {
+    "paramName": "s",
+    "paramLongName": "sourcePath",
+    "paramDescription": "the path of the sequential file to read",
+    "paramRequired": true
+  },
+  {
+    "paramName": "e",
+    "paramLongName": "entity",
+    "paramDescription": "the type of entity to be deduped",
+    "paramRequired": true
+  },
+  {
+    "paramName": "c",
+    "paramLongName": "dedupConf",
+    "paramDescription": "dedup configuration to be used",
+    "compressed": true,
+    "paramRequired": true
+  },
+  {
+    "paramName": "d",
+    "paramLongName": "dedupPath",
+    "paramDescription": "dedup path to load mergeRelation",
+    "paramRequired": true
+  }
+]
--- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json
+++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json
@ -0,0 +1,33 @@
+[
+  {
+    "paramName": "mt",
+    "paramLongName": "master",
+    "paramDescription": "should be local or yarn",
+    "paramRequired": true
+  },
+  {
+    "paramName": "s",
+    "paramLongName": "sourcePath",
+    "paramDescription": "the path of the sequential file to read",
+    "paramRequired": true
+  },
+  {
+    "paramName": "e",
+    "paramLongName": "entity",
+    "paramDescription": "the type of entity to be deduped",
+    "paramRequired": true
+  },
+  {
+    "paramName": "c",
+    "paramLongName": "dedupConf",
+    "paramDescription": "dedup configuration to be used",
+    "compressed": true,
+    "paramRequired": true
+  },
+  {
+    "paramName": "t",
+    "paramLongName": "targetPath",
+    "paramDescription": "target path to save dedup result",
+    "paramRequired": true
+  }
+]
--- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/dedup/oozie_app/workflow.xml
@ -49,7 +49,7 @@
            <mode>cluster</mode>
            <name>Create Similarity Relations</name>
            <class>eu.dnetlib.dedup.SparkCreateSimRels</class>
-            <jar>dhp-dedup-${projectVersion}.jar</jar>
+            <jar>dhp-dedup-scholexplorer-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory ${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
@ -73,7 +73,7 @@
            <mode>cluster</mode>
            <name>Create Connected Components</name>
            <class>eu.dnetlib.dedup.SparkCreateConnectedComponent</class>
-            <jar>dhp-dedup-${projectVersion}.jar</jar>
+            <jar>dhp-dedup-scholexplorer-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory ${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
@ -97,7 +97,7 @@
            <mode>cluster</mode>
            <name>Create Dedup Record</name>
            <class>eu.dnetlib.dedup.SparkCreateDedupRecord</class>
-            <jar>dhp-dedup-${projectVersion}.jar</jar>
+            <jar>dhp-dedup-scholexplorer-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory ${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
@ -121,7 +121,7 @@
            <mode>cluster</mode>
            <name>Propagate Dedup Relations</name>
            <class>eu.dnetlib.dedup.sx.SparkPropagateRelationsJob</class>
-            <jar>dhp-dedup-${projectVersion}.jar</jar>
+            <jar>dhp-dedup-scholexplorer-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory ${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
@ -145,7 +145,7 @@
            <mode>cluster</mode>
            <name>Update ${entity} and add DedupRecord</name>
            <class>eu.dnetlib.dedup.sx.SparkUpdateEntityJob</class>
-            <jar>dhp-dedup-${projectVersion}.jar</jar>
+            <jar>dhp-dedup-scholexplorer-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory ${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
--- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/Datacite2Scholix.java
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/Datacite2Scholix.java
@ -0,0 +1,203 @@
+package eu.dnetlib.dhp.provision;
+
+import com.jayway.jsonpath.JsonPath;
+import eu.dnetlib.dhp.provision.scholix.*;
+import eu.dnetlib.dhp.utils.DHPUtils;
+import eu.dnetlib.scholexplorer.relation.RelInfo;
+import eu.dnetlib.scholexplorer.relation.RelationMapper;
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
+ * Translates a DataCite record, supplied as a JSON string, into Scholix links.
+ *
+ * <p>The record itself becomes the source resource; every usable entry of
+ * {@code $.attributes.relatedIdentifiers} becomes one link (DOI targets) or a pair of
+ * direct/inverse links (any other identifier type).
+ */
+public class Datacite2Scholix {
+
+    private static final String DATACITE = "datacite";
+    private static final String UNKNOWN = "unknown";
+
+    final RelationMapper relationMapper;
+
+    public Datacite2Scholix(RelationMapper relationMapper) {
+        this.relationMapper = relationMapper;
+    }
+
+    /**
+     * Builds the Scholix links described by the related identifiers of a DataCite record.
+     *
+     * <p>Related identifiers that lack a non-blank {@code relatedIdentifier},
+     * {@code relatedIdentifierType} or {@code relationType} are ignored, as are relation
+     * types for which the relation mapper has no entry.
+     *
+     * @param dJson the DataCite record as JSON
+     * @return the generated links, or {@code null} when the record has no usable related identifier
+     */
+    public List<Scholix> generateScholixFromJson(final String dJson) {
+
+        final List<Map<String, String>> relIds = getRelatedIdentifiers(dJson);
+        if (relIds == null) {
+            return null;
+        }
+        final List<Map<String, String>> usable = relIds.stream()
+                .filter(Datacite2Scholix::isUsable)
+                .collect(Collectors.toList());
+        if (usable.isEmpty()) {
+            return null;
+        }
+
+        final String updated = JsonPath.read(dJson, "$.attributes.updated");
+        final ScholixResource resource = generateDataciteScholixResource(dJson);
+
+        return usable.stream()
+                .flatMap(m -> generateScholix(
+                        resource,
+                        m.get("relatedIdentifier"),
+                        m.get("relatedIdentifierType"),
+                        m.get("relationType"),
+                        updated).stream())
+                .collect(Collectors.toList());
+    }
+
+    /**
+     * Tells whether a related-identifier entry carries the three values needed to build a link.
+     * Checking the values (not just the keys) avoids a NullPointerException further down when
+     * the JSON holds an explicit null.
+     */
+    private static boolean isUsable(final Map<String, String> relatedIdentifier) {
+        return relatedIdentifier != null
+                && StringUtils.isNotBlank(relatedIdentifier.get("relatedIdentifierType"))
+                && StringUtils.isNotBlank(relatedIdentifier.get("relationType"))
+                && StringUtils.isNotBlank(relatedIdentifier.get("relatedIdentifier"));
+    }
+
+    /**
+     * Creates the link(s) between the DataCite resource and one related identifier.
+     *
+     * @return a single link for DOI targets, the direct and inverse links for any other
+     *         identifier type, or an empty list when the relation type is not mapped
+     */
+    private List<Scholix> generateScholix(ScholixResource source, final String pid, final String pidtype, final String relType, final String updated) {
+
+        final RelInfo relInfo = relationMapper.get(relType.toLowerCase());
+        if (relInfo == null) {
+            // Unmapped relation type: skip this identifier instead of failing the whole record.
+            return Collections.emptyList();
+        }
+
+        final ScholixResource target = new ScholixResource();
+        target.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, pidtype)));
+        final ScholixRelationship direct = new ScholixRelationship(relInfo.getOriginal(), DATACITE, relInfo.getInverse());
+
+        if ("doi".equalsIgnoreCase(pidtype)) {
+            // DOI targets stay bare (no dnet identifier, no link identifier, no inverse link).
+            // NOTE(review): presumably they are completed by a later resolution step - confirm.
+            return Collections.singletonList(newLink(source, target, source, direct, updated));
+        }
+
+        target.setDnetIdentifier(generateId(pid, pidtype, UNKNOWN));
+        target.setObjectType(UNKNOWN);
+        target.setCollectedFrom(generateDataciteCollectedFrom("incomplete"));
+
+        final Scholix forward = newLink(source, target, source, direct, updated);
+        forward.generateIdentifier();
+
+        final ScholixRelationship inverse = new ScholixRelationship(relInfo.getInverse(), DATACITE, relInfo.getOriginal());
+        final Scholix backward = newLink(target, source, source, inverse, updated);
+        backward.generateIdentifier();
+
+        final List<Scholix> result = new ArrayList<>();
+        result.add(forward);
+        result.add(backward);
+        return result;
+    }
+
+    /**
+     * Assembles one Scholix link; link provider and publisher are always taken from the
+     * DataCite record, whichever direction the link goes.
+     */
+    private Scholix newLink(final ScholixResource from, final ScholixResource to, final ScholixResource dataciteRecord, final ScholixRelationship relationship, final String updated) {
+        final ScholixEntityId provider = dataciteRecord.getCollectedFrom().get(0).getProvider();
+        final Scholix link = new Scholix();
+        link.setSource(from);
+        link.setTarget(to);
+        link.setLinkprovider(Collections.singletonList(provider));
+        link.setPublisher(dataciteRecord.getPublisher());
+        link.setRelationship(relationship);
+        link.setPublicationDate(updated);
+        return link;
+    }
+
+    /**
+     * Builds the Scholix resource describing the DataCite record itself: DOI, object type,
+     * dnet identifier, provenance and, when present, publisher, issue date, title and creators.
+     *
+     * @param dJson the DataCite record as JSON
+     * @return the resource, marked as collected from DataCite with status "complete"
+     */
+    public ScholixResource generateDataciteScholixResource(String dJson) {
+        final ScholixResource resource = new ScholixResource();
+        final String doi = JsonPath.read(dJson, "$.attributes.doi");
+        resource.setIdentifier(Collections.singletonList(new ScholixIdentifier(doi, "doi")));
+        resource.setObjectType(getType(dJson));
+        resource.setDnetIdentifier(generateId(doi, "doi", resource.getObjectType()));
+        resource.setCollectedFrom(generateDataciteCollectedFrom("complete"));
+        final String publisher = JsonPath.read(dJson, "$.attributes.publisher");
+        if (StringUtils.isNotBlank(publisher)) {
+            resource.setPublisher(Collections.singletonList(new ScholixEntityId(publisher, null)));
+        }
+        final String date = getDate(dJson);
+        if (StringUtils.isNotBlank(date)) {
+            resource.setPublicationDate(date);
+        }
+        final String title = getTitle(dJson);
+        if (StringUtils.isNotBlank(title)) {
+            resource.setTitle(title);
+        }
+        resource.setCreator(getCreators(dJson));
+        return resource;
+    }
+
+    /** Returns one entity per creator name, or {@code null} when the record lists none. */
+    private List<ScholixEntityId> getCreators(final String json) {
+        final List<String> creatorName = JsonPath.read(json, "$.attributes.creators[*].name");
+        if (creatorName == null || creatorName.isEmpty()) {
+            return null;
+        }
+        return creatorName.stream().map(s -> new ScholixEntityId(s, null)).collect(Collectors.toList());
+    }
+
+    /** Returns the first title of the record, or {@code null} when it has none. */
+    private String getTitle(final String json) {
+        final List<String> titles = JsonPath.read(json, "$.attributes.titles[*].title");
+        return titles != null && !titles.isEmpty() ? titles.get(0) : null;
+    }
+
+    /** Returns the date of the first entry whose dateType is "issued", or {@code null}. */
+    private String getDate(final String json) {
+        final List<Map<String, String>> dates = JsonPath.read(json, "$.attributes.dates");
+        if (dates == null) {
+            return null;
+        }
+        for (final Map<String, String> entry : dates) {
+            if ("issued".equalsIgnoreCase(entry.get("dateType"))) {
+                return entry.get("date");
+            }
+        }
+        return null;
+    }
+
+    /** Builds the single-element provenance list pointing at the DataCite datasource. */
+    private List<ScholixCollectedFrom> generateDataciteCollectedFrom(final String completionStatus) {
+        final ScholixEntityId scholixEntityId = new ScholixEntityId("Datasets in Datacite",
+                Collections.singletonList(new ScholixIdentifier("dli_________::datacite", "dnet_identifier")));
+        return Collections.singletonList(
+                new ScholixCollectedFrom(
+                        scholixEntityId, "collected", completionStatus));
+    }
+
+    /**
+     * Maps the bibtex type of the record to a Scholix object type: "publication" for
+     * articles, "dataset" for everything else, including records without a readable type.
+     */
+    private String getType(final String json) {
+        try {
+            final String bibtext = JsonPath.read(json, "$.attributes.types.bibtex");
+            if ("article".equalsIgnoreCase(bibtext)) {
+                return "publication";
+            }
+            return "dataset";
+        } catch (RuntimeException e) {
+            // Missing or non-textual type: fall back to the default object type.
+            return "dataset";
+        }
+    }
+
+    /** Reads every entry of {@code $.attributes.relatedIdentifiers} from the record. */
+    private List<Map<String, String>> getRelatedIdentifiers(final String json) {
+        return JsonPath.read(json, "$.attributes.relatedIdentifiers[*]");
+    }
+
+    /**
+     * Generates the dnet identifier of an entity: a type prefix followed by the MD5 of
+     * {@code pid::pidType}, both lower-cased and trimmed.
+     *
+     * @param pid        the persistent identifier value
+     * @param pidType    the persistent identifier type
+     * @param entityType one of "publication", "dataset" or "unknown"
+     * @return the generated identifier
+     * @throws IllegalArgumentException when {@code entityType} is none of the supported values
+     */
+    protected String generateId(final String pid, final String pidType, final String entityType) {
+        String type;
+        switch (entityType) {
+            case "publication":
+                type = "50|";
+                break;
+            case "dataset":
+                type = "60|";
+                break;
+            case "unknown":
+                type = "70|";
+                break;
+            default:
+                throw new IllegalArgumentException("unexpected value " + entityType);
+        }
+
+        return type + DHPUtils.md5(String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim()));
+    }
+
+}
--- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/DataciteClient.java
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/DataciteClient.java
@ -0,0 +1,114 @@
+package eu.dnetlib.dhp.provision;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.jayway.jsonpath.JsonPath;
+import net.minidev.json.JSONArray;
+import org.apache.commons.io.IOUtils;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.entity.StringEntity;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.List;
+import java.util.stream.Collectors;
+
+public class DataciteClient implements Iterator<String> {
+
+    final static String blobPath = "$.hits.hits[*]._source";
+    final static String scrollIdPath = "$._scroll_id";
+
+    String scrollId;
+
+    List<String> buffer;
+
+    final String esHost;
+    final String esIndex;
+    final ObjectMapper mapper = new ObjectMapper();
+
+    public DataciteClient(final String esHost, final String esIndex, final long timestamp) throws IOException {
+
+        this.esHost = esHost;
+        this.esIndex = esIndex;
+        final String body =getResponse(String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), String.format("{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}", timestamp));
+        scrollId= getJPathString(scrollIdPath, body);
+        buffer = getBlobs(body);
+
+    }
+
+
+    public String getResponse(final String url,final String json ) {
+        CloseableHttpClient client = HttpClients.createDefault();
+        try {
+
+            HttpPost httpPost = new HttpPost(url);
+            if (json!= null) {
+                StringEntity entity = new StringEntity(json);
+                httpPost.setEntity(entity);
+                httpPost.setHeader("Accept", "application/json");
+                httpPost.setHeader("Content-type", "application/json");
+            }
+            CloseableHttpResponse response = client.execute(httpPost);
+
+            return IOUtils.toString(response.getEntity().getContent());
+        } catch (Throwable e) {
+            throw new RuntimeException("Error on executing request ",e);
+        } finally {
+            try {
+                client.close();
+            } catch (IOException e) {
+                throw new RuntimeException("Unable to close client ",e);
+            }
+        }
+
+    }
+
+    private String  getJPathString(final String jsonPath, final String json) {
+        try {
+            Object o = JsonPath.read(json, jsonPath);
+            if (o instanceof String)
+                return (String) o;
+            return null;
+        } catch (Exception e) {
+            return "";
+        }
+    }
+
+    private List<String> getBlobs(final String body) {
+        JSONArray array = JsonPath.read(body, blobPath);
+        return array.stream().map(
+                o -> {
+                    try {
+                        return mapper.writeValueAsString(o);
+                    } catch (Throwable e) {
+                        throw new RuntimeException(e);
+                    }
+                }
+                ).collect(Collectors.toList());
+    }
+
+
+    @Override
+    public boolean hasNext() {
+        return (buffer!= null && !buffer.isEmpty());
+
+    }
+
+    @Override
+    public String next() {
+        final String nextItem = buffer.remove(0);
+        if (buffer.isEmpty()) {
+            final String json_param = String.format("{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}", scrollId);
+            final String body =getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param);
+            try {
+                buffer = getBlobs(body);
+            } catch (Throwable e) {
+                System.out.println(body);
+
+            }
+
+        }
+        return nextItem;
+    }
+}
--- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/RetrieveUpdateFromDatacite.java
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/RetrieveUpdateFromDatacite.java
@ -0,0 +1,44 @@
+package eu.dnetlib.dhp.provision;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+
+import java.net.URI;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * Command-line entry point meant to store DataCite updates into a Hadoop sequence file.
+ *
+ * <p>For now it only parses the arguments, configures the file system and creates the
+ * (empty) sequence file at the target path.
+ */
+public class RetrieveUpdateFromDatacite {
+
+    /**
+     * Reads the {@code namenode}, {@code targetPath} and {@code timestamp} arguments and
+     * creates a sequence file (IntWritable keys, Text values) at the target path.
+     *
+     * @param args command-line arguments described by retrieve_update_parameters.json
+     */
+    public static void main(String[] args) throws Exception{
+        final String parameterSpec = IOUtils.toString(
+                RetrieveUpdateFromDatacite.class.getResourceAsStream("/eu/dnetlib/dhp/provision/retrieve_update_parameters.json"));
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(parameterSpec);
+        parser.parseArgument(args);
+
+        final String nameNode = parser.get("namenode");
+        final Path outputPath = new Path(parser.get("targetPath"));
+        // Read but not used yet.
+        final String timestamp = parser.get("timestamp");
+
+        // Hadoop configuration pointing at the requested name node.
+        final Configuration hadoopConf = new Configuration();
+        hadoopConf.set("fs.defaultFS", nameNode);
+        // File system implementations are set explicitly ("Because of Maven").
+        hadoopConf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
+        hadoopConf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
+
+        FileSystem.get(URI.create(nameNode), hadoopConf);
+
+        // Not used yet.
+        final AtomicInteger counter = new AtomicInteger(0);
+
+        final SequenceFile.Writer.Option target = SequenceFile.Writer.file(outputPath);
+        final SequenceFile.Writer.Option keys = SequenceFile.Writer.keyClass(IntWritable.class);
+        final SequenceFile.Writer.Option values = SequenceFile.Writer.valueClass(Text.class);
+        try (SequenceFile.Writer writer = SequenceFile.createWriter(hadoopConf, target, keys, values)) {
+            // TODO: nothing is written yet; the writer is only opened and closed.
+        }
+    }
+
+}
--- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkExtractRelationCount.java
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkExtractRelationCount.java
@ -44,35 +44,8 @@ public class SparkExtractRelationCount {
        final String workingDirPath = parser.get("workingDirPath");

        final String relationPath = parser.get("relationPath");
-
-
-
-
-
        DatasetJoiner.startJoin(spark, relationPath,workingDirPath + "/relatedItemCount");

-
-
-
-//        sc.textFile(relationPath)
-//                // We start to Filter the relation not deleted by Inference
-//                .filter(ProvisionUtil::isNotDeleted)
-//                // Then we create a PairRDD<String, RelatedItem>
-//                .mapToPair((PairFunction<String, String, RelatedItemInfo>) f
-//                        -> new Tuple2<>(DHPUtils.getJPathString(ProvisionUtil.SOURCEJSONPATH, f), ProvisionUtil.getItemType(f, ProvisionUtil.TARGETJSONPATH)))
-//                //We reduce and sum the number of Relations
-//                .reduceByKey((Function2<RelatedItemInfo, RelatedItemInfo, RelatedItemInfo>) (v1, v2) -> {
-//                    if (v1 == null && v2 == null)
-//                        return new RelatedItemInfo();
-//                    return v1 != null ? v1.add(v2) : v2;
-//                })
-//                //Set the source Id in RelatedItem object
-//                .map(k -> k._2().setId(k._1()))
-//                // Convert to JSON and save as TextFile
-//                .map(k -> {
-//                    ObjectMapper mapper = new ObjectMapper();
-//                    return mapper.writeValueAsString(k);
-//                }).saveAsTextFile(workingDirPath + "/relatedItemCount", GzipCodec.class);
    }


--- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/retrieve_update_parameters.json
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/provision/retrieve_update_parameters.json
--- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/graph/Application/provision/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/graph/Application/provision/oozie_app/config-default.xml
--- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/graph/Application/provision/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/graph/Application/provision/oozie_app/workflow.xml
@ -8,10 +8,6 @@
            <name>graphPath</name>
            <description>the graph path</description>
        </property>
-        <property>
-            <name>index</name>
-            <description>index name</description>
-        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
@ -21,17 +17,16 @@
            <description>memory for individual executor</description>
        </property>
        <property>
-            <name>sparkExecutorCores</name>
-            <description>number of cores used by single executor</description>
+            <name>index</name>
+            <description>index name</description>
        </property>
-
        <property>
            <name>idScholix</name>
-            <description>the </description>
+            <description>the identifier name of the scholix </description>
        </property>
        <property>
            <name>idSummary</name>
-            <description>number of cores used by single executor</description>
+            <description>the identifier name of the summary</description>
        </property>
    </parameters>

@ -57,7 +52,7 @@
            <mode>cluster</mode>
            <name>calculate for each ID the number of related Dataset, publication and Unknown</name>
            <class>eu.dnetlib.dhp.provision.SparkExtractRelationCount</class>
-            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
            <spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT}</spark-opts>
            <arg>-mt</arg> <arg>yarn-cluster</arg>
            <arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
@ -75,7 +70,7 @@
            <mode>cluster</mode>
            <name>generate Summary</name>
            <class>eu.dnetlib.dhp.provision.SparkGenerateSummary</class>
-            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
            <spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT}</spark-opts>
            <arg>-mt</arg> <arg>yarn-cluster</arg>
            <arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
@ -93,7 +88,7 @@
            <mode>cluster</mode>
            <name>generate Scholix</name>
            <class>eu.dnetlib.dhp.provision.SparkGenerateScholix</class>
-            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
            <spark-opts>--executor-memory 6G --driver-memory=${sparkDriverMemory} ${sparkExtraOPT}</spark-opts>
            <arg>-mt</arg> <arg>yarn-cluster</arg>
            <arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
@ -111,8 +106,8 @@
            <mode>cluster</mode>
            <name>index Summary</name>
            <class>eu.dnetlib.dhp.provision.SparkIndexCollectionOnES</class>
-            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
-            <spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="64" </spark-opts>
+            <jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
+            <spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="32" </spark-opts>
            <arg>-mt</arg> <arg>yarn-cluster</arg>
            <arg>--sourcePath</arg><arg>${workingDirPath}/summary</arg>
            <arg>--index</arg><arg>${index}_object</arg>
@ -131,7 +126,7 @@
            <mode>cluster</mode>
            <name>index scholix</name>
            <class>eu.dnetlib.dhp.provision.SparkIndexCollectionOnES</class>
-            <jar>dhp-graph-provision-${projectVersion}.jar</jar>
+            <jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
            <spark-opts>--executor-memory ${sparkExecutorMemory}  --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="8" </spark-opts>
            <arg>-mt</arg> <arg>yarn-cluster</arg>
            <arg>--sourcePath</arg><arg>${workingDirPath}/scholix_json</arg>
--- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/DataciteClientTest.java
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/DataciteClientTest.java
@ -0,0 +1,109 @@
+package eu.dnetlib.dhp.provision;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.provision.scholix.Scholix;
+import eu.dnetlib.scholexplorer.relation.RelationMapper;
+import org.apache.commons.io.IOUtils;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.entity.StringEntity;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
+import org.junit.jupiter.api.Test;
+
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.List;
+
+
+/**
+ * Manual checks for the Datacite-to-Scholix conversion and for the Datacite client.
+ */
+public class DataciteClientTest {
+
+    /**
+     * Converts the bundled {@code datacite.json} fixture and prints the resulting Scholix list as JSON.
+     *
+     * @throws Exception if the fixture cannot be read or the conversion/serialization fails
+     */
+    @Test
+    public void dataciteSCholixTest() throws Exception {
+        // The fixture contains non-ASCII characters, so decode it explicitly as UTF-8.
+        final String json = IOUtils.toString(getClass().getResourceAsStream("datacite.json"), "UTF-8");
+        final RelationMapper mapper = RelationMapper.load();
+
+        Datacite2Scholix ds = new Datacite2Scholix(mapper);
+        final List<Scholix> s = ds.generateScholixFromJson(json);
+
+        System.out.println(new ObjectMapper().writeValueAsString(s));
+    }
+
+    /**
+     * Reads every record returned by a {@link DataciteClient}, converts it to Scholix and appends
+     * one JSON document per line to a local file.
+     *
+     * <p>NOTE(review): the host and the absolute output path look machine-specific, so this
+     * probably only runs in the original author's environment - confirm before enabling in CI.
+     *
+     * @throws Exception if setting up the collaborators or opening/closing the output file fails
+     */
+    @Test
+    public void testClient() throws Exception {
+        DataciteClient client = new DataciteClient("ip-90-147-167-25.ct1.garrservices.it","datacite",1585454082);
+        final RelationMapper mapper = RelationMapper.load();
+        Datacite2Scholix ds = new Datacite2Scholix(mapper);
+        final ObjectMapper m = new ObjectMapper();
+        int i = 0;
+        // try-with-resources guarantees the buffered output is flushed and the file is closed.
+        try (BufferedWriter writer = new BufferedWriter(new FileWriter("/Users/sandro/new_s.txt"))) {
+            while (client.hasNext()) {
+                i++;
+                final String next = client.next();
+                try {
+                    final List<Scholix> res = ds.generateScholixFromJson(next);
+                    if (res != null) {
+                        for (Scholix s : res) {
+                            writer.write(m.writeValueAsString(s));
+                            writer.write("\n");
+                        }
+                    }
+                } catch (Throwable t) {
+                    // Print the offending record before failing so it can be inspected.
+                    System.out.println(next);
+                    throw new RuntimeException(t);
+                }
+                if (i % 1000 == 0) {
+                    System.out.println("added " + i);
+                }
+            }
+        }
+    }
+
+    /**
+     * POSTs an optional JSON body to {@code url} and returns the response body as a string.
+     *
+     * @param url  endpoint to call
+     * @param json request body, or {@code null} to send the POST without an entity
+     * @return the response body decoded as UTF-8
+     * @throws RuntimeException wrapping any failure while executing the request or releasing resources
+     */
+    private String getResponse(final String url,final String json ) {
+        // Client and response are both closed automatically, even when the request fails.
+        try (CloseableHttpClient client = HttpClients.createDefault()) {
+            HttpPost httpPost = new HttpPost(url);
+            if (json != null) {
+                // StringEntity defaults to ISO-8859-1; JSON payloads must be sent as UTF-8.
+                StringEntity entity = new StringEntity(json, "UTF-8");
+                httpPost.setEntity(entity);
+                httpPost.setHeader("Accept", "application/json");
+                httpPost.setHeader("Content-type", "application/json");
+            }
+            try (CloseableHttpResponse response = client.execute(httpPost)) {
+                return IOUtils.toString(response.getEntity().getContent(), "UTF-8");
+            }
+        } catch (Throwable e) {
+            throw new RuntimeException("Error on executing request ", e);
+        }
+    }
+}
--- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/provision/datacite.json
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/provision/datacite.json
@ -0,0 +1,136 @@
+{
+  "relationships": {
+    "client": {
+      "data": {
+        "type": "clients",
+        "id": "crossref.citations"
+      }
+    }
+  },
+  "attributes": {
+    "contributors": [
+    ],
+    "titles": [
+      {
+        "title": "UV-visible spectroscopy in the interpretation of the tautomeric equilibrium of N,N′(bis-3,5-di-bromo-salicyliden)-1,2-diaminobenzene and the redox activity of its Co(II) complex. A quantum chemical approach."
+      }
+    ],
+    "descriptions": [
+    ],
+    "referenceCount": 0,
+    "subjects": [
+    ],
+    "container": {
+      "title": "Journal of Molecular Structure: THEOCHEM",
+      "firstPage": "97",
+      "volume": "367",
+      "lastPage": "110",
+      "identifierType": "ISSN",
+      "identifier": "0166-1280",
+      "type": "Journal"
+    },
+    "state": "findable",
+    "created": "2020-03-26T13:31:57.000Z",
+    "source": "levriero",
+    "metadataVersion": 0,
+    "version": null,
+    "isActive": true,
+    "contentUrl": null,
+    "geoLocations": [
+    ],
+    "updated": "2020-03-26T13:31:58.000Z",
+    "fundingReferences": [
+    ],
+    "viewCount": 0,
+    "registered": "2020-03-26T13:31:58.000Z",
+    "published": "1996",
+    "dates": [
+      {
+        "date": "1996-09",
+        "dateType": "Issued"
+      },
+      {
+        "date": "2019-04-17T13:58:25Z",
+        "dateType": "Updated"
+      }
+    ],
+    "relatedIdentifiers": [
+      {
+        "relationType": "IsPartOf",
+        "relatedIdentifier": "0166-1280",
+        "relatedIdentifierType": "ISSN",
+        "resourceTypeGeneral": "Collection"
+      }
+    ],
+    "reason": null,
+    "rightsList": [
+      {
+        "rightsUri": "https://www.elsevier.com/tdm/userlicense/1.0"
+      }
+    ],
+    "schemaVersion": "http://datacite.org/schema/kernel-4",
+    "types": {
+      "resourceType": "JournalArticle",
+      "ris": "JOUR",
+      "resourceTypeGeneral": "Text",
+      "bibtex": "article",
+      "citeproc": "article-journal",
+      "schemaOrg": "ScholarlyArticle"
+    },
+    "publisher": "Elsevier BV",
+    "publicationYear": 1996,
+    "doi": "10.1016/s0166-1280(96)04575-7",
+    "language": null,
+    "sizes": [
+    ],
+    "url": "https://linkinghub.elsevier.com/retrieve/pii/S0166128096045757",
+    "identifiers": [
+      {
+        "identifier": "https://doi.org/10.1016/s0166-1280(96)04575-7",
+        "identifierType": "DOI"
+      },
+      {
+        "identifier": "S0166128096045757",
+        "identifierType": "Publisher ID"
+      }
+    ],
+    "citationCount": 0,
+    "formats": [
+    ],
+    "downloadCount": 0,
+    "creators": [
+      {
+        "nameType": "Personal",
+        "givenName": "G.L.",
+        "name": "Estiú, G.L.",
+        "familyName": "Estiú",
+        "affiliation": [
+        ]
+      },
+      {
+        "nameType": "Personal",
+        "givenName": "A.H.",
+        "name": "Jubert, A.H.",
+        "familyName": "Jubert",
+        "affiliation": [
+        ]
+      },
+      {
+        "nameType": "Personal",
+        "givenName": "J.",
+        "name": "Costamagna, J.",
+        "familyName": "Costamagna",
+        "affiliation": [
+        ]
+      },
+      {
+        "nameType": "Personal",
+        "givenName": "J.",
+        "name": "Vargas, J.",
+        "familyName": "Vargas",
+        "affiliation": [
+        ]
+      }
+    ]
+  }
+}
--- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/provision/es.json
+++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/resources/eu/dnetlib/dhp/provision/es.json