Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop into orcid-no-doi

2020-11-23 09:49:53 +01:00 · 2020-11-23 09:49:53 +01:00 · 5c9a727895
parent 97c8111847 d48f388fb2
commit 5c9a727895
28 changed files with 1305 additions and 1378 deletions
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java
@ -5,12 +5,16 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;

+import org.apache.commons.lang3.StringUtils;
+
 import eu.dnetlib.broker.objects.OaBrokerMainEntity;
 import eu.dnetlib.dhp.broker.model.Topic;
 import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;

 public class EnrichMissingAbstract extends UpdateMatcher<String> {

+	private static final int MIN_LENGTH = 200;
+
 	public EnrichMissingAbstract() {
 		super(1,
 			s -> Topic.ENRICH_MISSING_ABSTRACT,
@ -21,10 +25,15 @@ public class EnrichMissingAbstract extends UpdateMatcher<String> {
 	@Override
 	protected List<String> findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) {
 		if (isMissing(target.getAbstracts()) && !isMissing(source.getAbstracts())) {
-			return Arrays.asList(source.getAbstracts().get(0));
-		} else {
-			return new ArrayList<>();
+			return source
+				.getAbstracts()
+				.stream()
+				.filter(s -> StringUtils.normalizeSpace(s).length() >= MIN_LENGTH)
+				.map(Arrays::asList)
+				.findFirst()
+				.orElse(new ArrayList<>());
 		}
+		return new ArrayList<>();
 	}

 }
--- a/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-broker-events/src/main/resources/eu/dnetlib/dhp/broker/oa/notifications_only/oozie_app/workflow.xml
@ -1,4 +1,4 @@
-<workflow-app name="create broker events - partial" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="update broker notifications" xmlns="uri:oozie:workflow:0.5">

    <parameters>
        <property>
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java
@ -1,10 +1,36 @@

 package eu.dnetlib.dhp.oa.graph.raw;

-import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
-import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.NOT_AVAILABLE;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.REPOSITORY_PROVENANCE_ACTIONS;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
+import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.dataInfo;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.journal;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.keyValue;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listFields;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.oaiIProvenance;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.qualifier;
+import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;

-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;

 import org.apache.commons.lang3.StringUtils;
 import org.dom4j.Document;
@ -15,7 +41,24 @@ import org.dom4j.Node;
 import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
 import eu.dnetlib.dhp.schema.common.LicenseComparator;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.dhp.schema.oaf.Context;
+import eu.dnetlib.dhp.schema.oaf.DataInfo;
+import eu.dnetlib.dhp.schema.oaf.Dataset;
+import eu.dnetlib.dhp.schema.oaf.Field;
+import eu.dnetlib.dhp.schema.oaf.GeoLocation;
+import eu.dnetlib.dhp.schema.oaf.Instance;
+import eu.dnetlib.dhp.schema.oaf.Journal;
+import eu.dnetlib.dhp.schema.oaf.KeyValue;
+import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.Qualifier;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.schema.oaf.Software;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;

 public abstract class AbstractMdRecordToOafMapper {

@ -92,10 +135,10 @@ public abstract class AbstractMdRecordToOafMapper {
 	}

 	protected String getResultType(final Document doc, final List<Instance> instances) {
-		String type = doc.valueOf("//dr:CobjCategory/@type");
+		final String type = doc.valueOf("//dr:CobjCategory/@type");

 		if (StringUtils.isBlank(type) & vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
-			String instanceType = instances
+			final String instanceType = instances
 				.stream()
 				.map(i -> i.getInstancetype().getClassid())
 				.findFirst()
@ -256,13 +299,11 @@ public abstract class AbstractMdRecordToOafMapper {
 		r.setDataInfo(info);
 		r.setLastupdatetimestamp(lastUpdateTimestamp);
 		r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false));
-
 		r.setOriginalId(Arrays.asList(findOriginalId(doc)));
-
 		r.setCollectedfrom(Arrays.asList(collectedFrom));
 		r.setPid(prepareResultPids(doc, info));
-		r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));
-		r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation"));
+		r.setDateofcollection(doc.valueOf("//dr:dateOfCollection|//dri:dateOfCollection"));
+		r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation|//dri:dateOfTransformation"));
 		r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES
 		r.setOaiprovenance(prepareOAIprovenance(doc));
 		r.setAuthor(prepareAuthors(doc, info));
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java
@ -162,15 +162,21 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
 		for (final Object o : doc.selectNodes("//datacite:date")) {
 			final String dateType = ((Node) o).valueOf("@dateType");
 			if (StringUtils.isBlank(dateType)
-				&& !dateType.equalsIgnoreCase("Accepted")
-				&& !dateType.equalsIgnoreCase("Issued")
-				&& !dateType.equalsIgnoreCase("Updated")
-				&& !dateType.equalsIgnoreCase("Available")) {
+				|| (!dateType.equalsIgnoreCase("Accepted")
+					&& !dateType.equalsIgnoreCase("Issued")
+					&& !dateType.equalsIgnoreCase("Updated")
+					&& !dateType.equalsIgnoreCase("Available"))) {
 				res
 					.add(
 						structuredProperty(
 							((Node) o).getText(), "UNKNOWN", "UNKNOWN", DNET_DATA_CITE_DATE, DNET_DATA_CITE_DATE,
 							info));
+			} else {
+				res
+					.add(
+						structuredProperty(
+							((Node) o).getText(), dateType, dateType, DNET_DATA_CITE_DATE, DNET_DATA_CITE_DATE,
+							info));
 			}
 		}
 		return res;
@ -341,7 +347,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
 							getRelation(
 								otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, collectedFrom, info,
 								lastUpdateTimestamp));
-				} else if (type.equals("IsPartOf")) {
+				} else if (type.equalsIgnoreCase("IsPartOf")) {

 					res
 						.add(
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java
@ -7,8 +7,6 @@ import static org.mockito.Mockito.lenient;
 import java.io.IOException;
 import java.util.List;
 import java.util.Set;
-import java.util.function.Predicate;
-import java.util.stream.Collectors;
 import java.util.stream.Stream;

 import org.apache.commons.io.IOUtils;
@ -21,7 +19,10 @@ import org.mockito.junit.jupiter.MockitoExtension;
 import com.fasterxml.jackson.databind.ObjectMapper;

 import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
-import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.Qualifier;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
@ -78,6 +78,8 @@ public class MappersTest {
 		assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
 		assertFalse(p.getDataInfo().getInvisible());
 		assertTrue(p.getSource().size() == 1);
+		assertTrue(StringUtils.isNotBlank(p.getDateofcollection()));
+		assertTrue(StringUtils.isNotBlank(p.getDateoftransformation()));

 		assertTrue(p.getAuthor().size() > 0);
 		final Optional<Author> author = p
@ -329,7 +331,7 @@ public class MappersTest {
 	@Test
 	void testODFRecord() throws IOException {
 		final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_record.xml"));
-		List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
+		final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
 		System.out.println("***************");
 		System.out.println(new ObjectMapper().writeValueAsString(list));
 		System.out.println("***************");
@ -340,6 +342,22 @@ public class MappersTest {
 		assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
 	}

+	@Test
+	void testTextGrid() throws IOException {
+		final String xml = IOUtils.toString(getClass().getResourceAsStream("textgrid.xml"));
+		final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
+
+		System.out.println("***************");
+		System.out.println(new ObjectMapper().writeValueAsString(list));
+		System.out.println("***************");
+
+		final Dataset p = (Dataset) list.get(0);
+		assertValidId(p.getId());
+		assertValidId(p.getCollectedfrom().get(0).getKey());
+		assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
+		System.out.println(p.getTitle().get(0).getValue());
+	}
+
 	private void assertValidId(final String id) {
 		assertEquals(49, id.length());
 		assertEquals('|', id.charAt(2));
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml
@ -7,13 +7,12 @@
  <header xmlns="http://namespace.openaire.eu/">
    <dri:objIdentifier>pensoft_____::00ea4a1cd53806a97d62ea6bf268f2a2</dri:objIdentifier>
    <dri:recordIdentifier>10.3897/oneeco.2.e13718</dri:recordIdentifier>
-    <dri:dateOfCollection/>
    <dri:mdFormat/>
    <dri:mdFormatInterpretation/>
    <dri:repositoryId/>
    <dr:objectIdentifier/>
-    <dr:dateOfCollection>2020-03-23T00:20:51.392Z</dr:dateOfCollection>
-    <dr:dateOfTransformation>2020-03-23T00:26:59.078Z</dr:dateOfTransformation>
+    <dri:dateOfCollection>2020-03-23T00:20:51.392Z</dri:dateOfCollection>
+    <dri:dateOfTransformation>2020-03-23T00:26:59.078Z</dri:dateOfTransformation>
    <oaf:datasourceprefix>pensoft_____</oaf:datasourceprefix>
  </header>
  <metadata xmlns="http://namespace.openaire.eu/">
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/textgrid.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/textgrid.xml
@ -0,0 +1,113 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<record xmlns:dr="http://www.driver-repository.eu/namespace/dr"
+        xmlns:oaf="http://namespace.openaire.eu/oaf"
+        xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+    <oai:header xmlns="http://namespace.openaire.eu/"
+                xmlns:dc="http://purl.org/dc/elements/1.1/"
+                xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
+        <dri:objIdentifier>r3f52792889d::000051aa1f61d77d2c0b340091f8024e</dri:objIdentifier>
+        <dri:recordIdentifier>textgrid:q9cv.0</dri:recordIdentifier>
+        <dri:dateOfCollection>2020-11-17T09:34:11.128+01:00</dri:dateOfCollection>
+        <oaf:datasourceprefix>r3f52792889d</oaf:datasourceprefix>
+        <identifier xmlns="http://www.openarchives.org/OAI/2.0/">textgrid:q9cv.0</identifier>
+        <datestamp xmlns="http://www.openarchives.org/OAI/2.0/">2012-01-21T13:35:20Z</datestamp>
+        <dr:dateOfTransformation>2020-11-17T09:46:21.551+01:00</dr:dateOfTransformation>
+    </oai:header>
+    <metadata>
+        <datacite:resource xmlns="http://www.openarchives.org/OAI/2.0/"
+                           xmlns:datacite="http://datacite.org/schema/kernel-3"
+                           xmlns:dc="http://purl.org/dc/elements/1.1/"
+                           xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
+            <datacite:identifier identifierType="Handle">hdl:11858/00-1734-0000-0003-7664-F</datacite:identifier>
+            <datacite:creators>
+                <datacite:creator>
+                    <datacite:creatorName>Hoffmann von Fallersleben, August Heinrich</datacite:creatorName>
+                    <datacite:nameIdentifier nameIdentifierScheme="pnd" schemeURI="https://de.dariah.eu/pnd-service">118552589</datacite:nameIdentifier>
+                </datacite:creator>
+            </datacite:creators>
+            <datacite:titles>
+                <datacite:title titleType="Other">Mailied</datacite:title>
+                <datacite:title titleType="Other">August Heinrich Hoffmann von Fallersleben: Unpolitische Lieder von Hoffmann von Fallersleben, 1. + 2. Theil, 1. Theil, Hamburg: Hoffmann und Campe, 1841.</datacite:title>
+            </datacite:titles>
+            <datacite:publisher>TextGrid</datacite:publisher>
+            <datacite:publicationYear>2012</datacite:publicationYear>
+            <datacite:contributors>
+                <datacite:contributor contributorType="DataManager">
+                    <datacite:contributorName>tvitt@textgrid.de</datacite:contributorName>
+                </datacite:contributor>
+                <datacite:contributor contributorType="Other">
+                    <datacite:contributorName>Digitale Bibliothek</datacite:contributorName>
+                    <datacite:nameIdentifier nameIdentifierScheme="textgrid">TGPR-372fe6dc-57f2-6cd4-01b5-2c4bbefcfd3c</datacite:nameIdentifier>
+                </datacite:contributor>
+            </datacite:contributors>
+            <datacite:dates>
+                <datacite:date dateType="Created">2012-01-21T13:35:20Z</datacite:date>
+                <datacite:date dateType="Issued">2012-01-21T13:35:20Z</datacite:date>
+                <datacite:date dateType="Updated">2012-01-21T13:35:20Z</datacite:date>
+            </datacite:dates>
+            <datacite:resourceType resourceTypeGeneral="Dataset"/>
+            <alternateIdentifiers>
+                <datacite:alternateIdentifier alternateIdentifierType="URI">textgrid:q9cv.0</datacite:alternateIdentifier>
+                <alternateIdentifier alternateIdentifierType="URL">http://hdl.handle.net/hdl:11858/00-1734-0000-0003-7664-F</alternateIdentifier>
+            </alternateIdentifiers>
+            <datacite:relatedIdentifiers>
+                <datacite:relatedIdentifier relatedIdentifierType="Handle" relationType="IsPartOf">hdl:11858/00-1734-0000-0003-7666-B</datacite:relatedIdentifier>
+            </datacite:relatedIdentifiers>
+            <datacite:sizes>
+                <datacite:size>527 Bytes</datacite:size>
+            </datacite:sizes>
+            <datacite:formats>
+                <datacite:format>text/tg.edition+tg.aggregation+xml</datacite:format>
+            </datacite:formats>
+            <datacite:version>0</datacite:version>
+            <datacite:rightsList>
+                <datacite:rights rightsURI="http://creativecommons.org/licenses/by/3.0/de/legalcode"> Der annotierte Datenbestand der Digitalen Bibliothek inklusive
+                    Metadaten sowie davon einzeln zugängliche Teile sind eine Abwandlung
+                    des Datenbestandes von www.editura.de durch TextGrid und werden
+                    unter der Lizenz Creative Commons Namensnennung 3.0 Deutschland
+                    Lizenz (by-Nennung TextGrid) veröffentlicht. Die Lizenz bezieht sich
+                    nicht auf die der Annotation zu Grunde liegenden allgemeinfreien
+                    Texte (Siehe auch Punkt 2 der Lizenzbestimmungen).</datacite:rights>
+                <datacite:rights rightsURI="info:eu-repo/semantics/openAccess"/>
+            </datacite:rightsList>
+            <datacite:descriptions>
+                <datacite:description descriptionType="Abstract"/>
+            </datacite:descriptions>
+            <datacite:geoLocations>
+                <datacite:geoLocation>
+                    <datacite:geoLocationPlace
+                            xmlns:xs="http://www.w3.org/2001/XMLSchema" xsi:type="xs:string">Hamburg</datacite:geoLocationPlace>
+                </datacite:geoLocation>
+            </datacite:geoLocations>
+        </datacite:resource>
+        <oaf:identifier identifierType="handle">hdl:11858/00-1734-0000-0003-7664-F</oaf:identifier>
+        <dr:CobjCategory type="dataset">0021</dr:CobjCategory>
+        <oaf:refereed>0002</oaf:refereed>
+        <oaf:dateAccepted>2012-01-01</oaf:dateAccepted>
+        <oaf:accessrights>OPEN</oaf:accessrights>
+        <oaf:license>http://creativecommons.org/licenses/by/3.0/de/legalcode</oaf:license>
+        <oaf:language>und</oaf:language>
+        <oaf:hostedBy id="re3data_____::r3d100011365" name="TextGrid Repository"/>
+        <oaf:collectedFrom id="re3data_____::r3d100011365" name="TextGrid Repository"/>
+    </metadata>
+    <about xmlns:dc="http://purl.org/dc/elements/1.1/"
+           xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
+        <provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
+            <originDescription altered="true" harvestDate="2020-11-17T09:34:11.128+01:00">
+                <baseURL>https%3A%2F%2Fdev.textgridlab.org%2F1.0%2Ftgoaipmh%2Foai</baseURL>
+                <identifier>textgrid:q9cv.0</identifier>
+                <datestamp>2012-01-21T13:35:20Z</datestamp>
+                <metadataNamespace>http://schema.datacite.org/oai/oai-1.0/</metadataNamespace>
+            </originDescription>
+        </provenance>
+        <oaf:datainfo>
+            <oaf:inferred>false</oaf:inferred>
+            <oaf:deletedbyinference>false</oaf:deletedbyinference>
+            <oaf:trust>0.9</oaf:trust>
+            <oaf:inferenceprovenance/>
+            <oaf:provenanceaction classid="sysimport:crosswalk"
+                                  classname="sysimport:crosswalk"
+                                  schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
+        </oaf:datainfo>
+    </about>
+</record>
--- a/dhp-workflows/dhp-graph-provision/pom.xml
+++ b/dhp-workflows/dhp-graph-provision/pom.xml
@ -22,6 +22,12 @@
        <dependency>
            <groupId>com.jayway.jsonpath</groupId>
            <artifactId>json-path</artifactId>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>slf4j-api</artifactId>
+                </exclusion>
+            </exclusions>
        </dependency>
        <dependency>
            <groupId>dom4j</groupId>
@ -82,9 +88,6 @@
                    <groupId>org.codehaus.woodstox</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
-
-
-
                <exclusion>
                    <groupId>com.github.ben-manes.caffeine</groupId>
                    <artifactId>*</artifactId>
@ -109,11 +112,10 @@
                    <groupId>org.apache.hadoop</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
-
-
-
-
-
+                <exclusion>
+                    <groupId>org.apache.zookeeper</groupId>
+                    <artifactId>zookeeper</artifactId>
+                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/ProvisionConstants.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/ProvisionConstants.java
@ -3,6 +3,10 @@ package eu.dnetlib.dhp.oa.provision;

 public class ProvisionConstants {

+	public static final String LAYOUT = "index";
+	public static final String INTERPRETATION = "openaire";
+	public static final String SEPARATOR = "-";
+
 	public static final int MAX_EXTERNAL_ENTITIES = 50;
 	public static final int MAX_AUTHORS = 200;
 	public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
@ -11,4 +15,8 @@ public class ProvisionConstants {
 	public static final int MAX_ABSTRACT_LENGTH = 100000;
 	public static final int MAX_INSTANCES = 10;

+	public static String getCollectionName(String format) {
+		return format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
+	}
+
 }
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java
@ -14,11 +14,12 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
 import eu.dnetlib.dhp.oa.provision.utils.ZkServers;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

-public class SolrAdminApplication extends SolrApplication implements Closeable {
+public class SolrAdminApplication implements Closeable {

 	private static final Logger log = LoggerFactory.getLogger(SolrAdminApplication.class);

@ -54,12 +55,12 @@ public class SolrAdminApplication extends SolrApplication implements Closeable {
 			.orElse(false);
 		log.info("commit: {}", commit);

-		final ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl);
+		final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));

-		final String zkHost = getZkHost(isLookup);
+		final String zkHost = isLookup.getZkHost();
 		log.info("zkHost: {}", zkHost);

-		final String collection = format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
+		final String collection = ProvisionConstants.getCollectionName(format);
 		log.info("collection: {}", collection);

 		try (SolrAdminApplication app = new SolrAdminApplication(zkHost)) {
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrApplication.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrApplication.java
@ -1,40 +0,0 @@
-
-package eu.dnetlib.dhp.oa.provision;
-
-import org.apache.commons.lang3.StringUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
-
-public abstract class SolrApplication {
-
-	private static final Logger log = LoggerFactory.getLogger(SolrApplication.class);
-
-	protected static final String LAYOUT = "index";
-	protected static final String INTERPRETATION = "openaire";
-	protected static final String SEPARATOR = "-";
-	protected static final String DATE_FORMAT = "yyyy-MM-dd'T'hh:mm:ss'Z'";
-
-	/**
-	 * Method retrieves from the information system the zookeeper quorum of the Solr server
-	 *
-	 * @param isLookup
-	 * @return the zookeeper quorum of the Solr server
-	 * @throws ISLookUpException
-	 */
-	protected static String getZkHost(ISLookUpService isLookup) throws ISLookUpException {
-		return doLookup(
-			isLookup,
-			"for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='IndexServiceResourceType'] return $x//PROTOCOL[./@name='solr']/@address/string()");
-	}
-
-	protected static String doLookup(ISLookUpService isLookup, String xquery) throws ISLookUpException {
-		log.info(String.format("running xquery: %s", xquery));
-		final String res = isLookup.getResourceProfileByQuery(xquery);
-		log.info(String.format("got response (100 chars): %s", StringUtils.left(res, 100) + " ..."));
-		return res;
-	}
-
-}
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java
@ -10,6 +10,7 @@ import java.text.SimpleDateFormat;
 import java.util.Date;
 import java.util.Optional;

+import javax.swing.text.html.Option;
 import javax.xml.transform.Transformer;
 import javax.xml.transform.TransformerException;
 import javax.xml.transform.stream.StreamResult;
@ -20,27 +21,48 @@ import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.rdd.RDD;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import com.lucidworks.spark.util.SolrSupport;

 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
+import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
 import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpDocumentNotFoundException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

-public class XmlIndexingJob extends SolrApplication {
+public class XmlIndexingJob {

 	private static final Logger log = LoggerFactory.getLogger(XmlIndexingJob.class);

+	public enum OutputFormat {
+		SOLR, HDFS
+	}
+
 	private static final Integer DEFAULT_BATCH_SIZE = 1000;

+	protected static final String DATE_FORMAT = "yyyy-MM-dd'T'hh:mm:ss'Z'";
+
+	private String inputPath;
+
+	private String format;
+
+	private int batchSize;
+
+	private OutputFormat outputFormat;
+
+	private String outputPath;
+
+	private SparkSession spark;
+
 	public static void main(String[] args) throws Exception {

 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -60,27 +82,64 @@ public class XmlIndexingJob extends SolrApplication {
 		final String inputPath = parser.get("inputPath");
 		log.info("inputPath: {}", inputPath);

-		final String isLookupUrl = parser.get("isLookupUrl");
-		log.info("isLookupUrl: {}", isLookupUrl);
-
 		final String format = parser.get("format");
 		log.info("format: {}", format);

-		final Integer batchSize = parser.getObjectMap().containsKey("batchSize")
-			? Integer.valueOf(parser.get("batchSize"))
-			: DEFAULT_BATCH_SIZE;
+		final String outputPath = Optional
+			.ofNullable(parser.get("outputPath"))
+			.map(StringUtils::trim)
+			.orElse(null);
+		log.info("outputPath: {}", outputPath);
+
+		final Integer batchSize = Optional
+			.ofNullable(parser.get("batchSize"))
+			.map(Integer::valueOf)
+			.orElse(DEFAULT_BATCH_SIZE);
 		log.info("batchSize: {}", batchSize);

-		final ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl);
-		final String fields = getLayoutSource(isLookup, format);
+		final OutputFormat outputFormat = Optional
+			.ofNullable(parser.get("outputFormat"))
+			.map(OutputFormat::valueOf)
+			.orElse(OutputFormat.SOLR);
+		log.info("outputFormat: {}", outputFormat);
+
+		final SparkConf conf = new SparkConf();
+		conf.registerKryoClasses(new Class[] {
+			SerializableSolrInputDocument.class
+		});
+
+		runWithSparkSession(
+			conf,
+			isSparkSessionManaged,
+			spark -> {
+				final String isLookupUrl = parser.get("isLookupUrl");
+				log.info("isLookupUrl: {}", isLookupUrl);
+				final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));
+				new XmlIndexingJob(spark, inputPath, format, batchSize, outputFormat, outputPath).run(isLookup);
+			});
+	}
+
+	public XmlIndexingJob(SparkSession spark, String inputPath, String format, Integer batchSize,
+		OutputFormat outputFormat,
+		String outputPath) {
+		this.spark = spark;
+		this.inputPath = inputPath;
+		this.format = format;
+		this.batchSize = batchSize;
+		this.outputFormat = outputFormat;
+		this.outputPath = outputPath;
+	}
+
+	public void run(ISLookupClient isLookup) throws ISLookUpException, TransformerException {
+		final String fields = isLookup.getLayoutSource(format);
 		log.info("fields: {}", fields);

-		final String xslt = getLayoutTransformer(isLookup);
+		final String xslt = isLookup.getLayoutTransformer();

-		final String dsId = getDsId(format, isLookup);
+		final String dsId = isLookup.getDsId(format);
 		log.info("dsId: {}", dsId);

-		final String zkHost = getZkHost(isLookup);
+		final String zkHost = isLookup.getZkHost();
 		log.info("zkHost: {}", zkHost);

 		final String version = getRecordDatestamp();
@ -88,24 +147,31 @@ public class XmlIndexingJob extends SolrApplication {
 		final String indexRecordXslt = getLayoutTransformer(format, fields, xslt);
 		log.info("indexRecordTransformer {}", indexRecordXslt);

-		final SparkConf conf = new SparkConf();
+		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

-		runWithSparkSession(
-			conf,
-			isSparkSessionManaged,
-			spark -> {
-				final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+		JavaRDD<SolrInputDocument> docs = sc
+			.sequenceFile(inputPath, Text.class, Text.class)
+			.map(t -> t._2().toString())
+			.map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s))
+			.map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s));

-				RDD<SolrInputDocument> docs = sc
-					.sequenceFile(inputPath, Text.class, Text.class)
-					.map(t -> t._2().toString())
-					.map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s))
-					.map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s))
-					.rdd();
-
-				final String collection = format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
-				SolrSupport.indexDocs(zkHost, collection, batchSize, docs);
-			});
+		switch (outputFormat) {
+			case SOLR:
+				final String collection = ProvisionConstants.getCollectionName(format);
+				SolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd());
+				break;
+			case HDFS:
+				spark
+					.createDataset(
+						docs.map(s -> new SerializableSolrInputDocument(s)).rdd(),
+						Encoders.kryo(SerializableSolrInputDocument.class))
+					.write()
+					.mode(SaveMode.Overwrite)
+					.parquet(outputPath);
+				break;
+			default:
+				throw new IllegalArgumentException("invalid outputFormat: " + outputFormat);
+		}
 	}

 	protected static String toIndexRecord(Transformer tr, final String record) {
@ -151,56 +217,4 @@ public class XmlIndexingJob extends SolrApplication {
 		return new SimpleDateFormat(DATE_FORMAT).format(new Date());
 	}

-	/**
-	 * Method retrieves from the information system the list of fields associated to the given MDFormat name
-	 *
-	 * @param isLookup the ISLookup service stub
-	 * @param format the Metadata format name
-	 * @return the string representation of the list of fields to be indexed
-	 * @throws ISLookUpDocumentNotFoundException
-	 * @throws ISLookUpException
-	 */
-	private static String getLayoutSource(final ISLookUpService isLookup, final String format)
-		throws ISLookUpDocumentNotFoundException, ISLookUpException {
-		return doLookup(
-			isLookup,
-			String
-				.format(
-					"collection('')//RESOURCE_PROFILE[.//RESOURCE_TYPE/@value = 'MDFormatDSResourceType' and .//NAME='%s']//LAYOUT[@name='%s']",
-					format, LAYOUT));
-	}
-
-	/**
-	 * Method retrieves from the information system the openaireLayoutToRecordStylesheet
-	 *
-	 * @param isLookup the ISLookup service stub
-	 * @return the string representation of the XSLT contained in the transformation rule profile
-	 * @throws ISLookUpDocumentNotFoundException
-	 * @throws ISLookUpException
-	 */
-	private static String getLayoutTransformer(ISLookUpService isLookup) throws ISLookUpException {
-		return doLookup(
-			isLookup,
-			"collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType')"
-				+ "//RESOURCE_PROFILE[./BODY/CONFIGURATION/SCRIPT/TITLE/text() = 'openaireLayoutToRecordStylesheet']//CODE/node()");
-	}
-
-	/**
-	 * Method retrieves from the information system the IndexDS profile ID associated to the given MDFormat name
-	 *
-	 * @param format
-	 * @param isLookup
-	 * @return the IndexDS identifier
-	 * @throws ISLookUpException
-	 */
-	private static String getDsId(String format, ISLookUpService isLookup) throws ISLookUpException {
-		return doLookup(
-			isLookup,
-			String
-				.format(
-					"collection('/db/DRIVER/IndexDSResources/IndexDSResourceType')"
-						+ "//RESOURCE_PROFILE[./BODY/CONFIGURATION/METADATA_FORMAT/text() = '%s']//RESOURCE_IDENTIFIER/@value/string()",
-					format));
-	}
-
 }
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SerializableSolrInputDocument.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SerializableSolrInputDocument.java
@ -0,0 +1,23 @@
+
+package eu.dnetlib.dhp.oa.provision.model;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.SolrInputField;
+
+/**
+ * Wrapper class needed to make the SolrInputDocument compatible with the Kryo serialization mechanism.
+ */
+public class SerializableSolrInputDocument extends SolrInputDocument {
+
+	public SerializableSolrInputDocument() {
+		super(new HashMap<>());
+	}
+
+	public SerializableSolrInputDocument(Map<String, SolrInputField> fields) {
+		super(fields);
+	}
+
+}
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ISLookupClient.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ISLookupClient.java
@ -0,0 +1,95 @@
+
+package eu.dnetlib.dhp.oa.provision.utils;
+
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.oa.provision.ProvisionConstants;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpDocumentNotFoundException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+
+public class ISLookupClient {
+
+	private static final Logger log = LoggerFactory.getLogger(ISLookupClient.class);
+
+	private ISLookUpService isLookup;
+
+	public ISLookupClient(ISLookUpService isLookup) {
+		this.isLookup = isLookup;
+	}
+
+	/**
+	 * Method retrieves from the information system the list of fields associated to the given MDFormat name
+	 *
+	 * @param format the Metadata format name
+	 * @return the string representation of the list of fields to be indexed
+	 * @throws ISLookUpDocumentNotFoundException
+	 * @throws ISLookUpException
+	 */
+	public String getLayoutSource(final String format)
+		throws ISLookUpDocumentNotFoundException, ISLookUpException {
+		return doLookup(
+			String
+				.format(
+					"collection('')//RESOURCE_PROFILE[.//RESOURCE_TYPE/@value = 'MDFormatDSResourceType' and .//NAME='%s']//LAYOUT[@name='%s']",
+					format, ProvisionConstants.LAYOUT));
+	}
+
+	/**
+	 * Method retrieves from the information system the openaireLayoutToRecordStylesheet
+	 *
+	 * @return the string representation of the XSLT contained in the transformation rule profile
+	 * @throws ISLookUpDocumentNotFoundException
+	 * @throws ISLookUpException
+	 */
+	public String getLayoutTransformer() throws ISLookUpException {
+		return doLookup(
+			"collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType')"
+				+ "//RESOURCE_PROFILE[./BODY/CONFIGURATION/SCRIPT/TITLE/text() = 'openaireLayoutToRecordStylesheet']//CODE/node()");
+	}
+
+	/**
+	 * Method retrieves from the information system the IndexDS profile ID associated to the given MDFormat name
+	 *
+	 * @param format
+	 * @return the IndexDS identifier
+	 * @throws ISLookUpException
+	 */
+	public String getDsId(String format) throws ISLookUpException {
+		return doLookup(
+			String
+				.format(
+					"collection('/db/DRIVER/IndexDSResources/IndexDSResourceType')"
+						+ "//RESOURCE_PROFILE[./BODY/CONFIGURATION/METADATA_FORMAT/text() = '%s']//RESOURCE_IDENTIFIER/@value/string()",
+					format));
+	}
+
+	/**
+	 * Method retrieves from the information system the zookeeper quorum of the Solr server
+	 *
+	 * @return the zookeeper quorum of the Solr server
+	 * @throws ISLookUpException
+	 */
+	public String getZkHost() throws ISLookUpException {
+		return doLookup(
+			"for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='IndexServiceResourceType'] return $x//PROTOCOL[./@name='solr']/@address/string()");
+	}
+
+	private String doLookup(String xquery) throws ISLookUpException {
+		log.info(String.format("running xquery: %s", xquery));
+		final String res = getIsLookup().getResourceProfileByQuery(xquery);
+		log.info(String.format("got response (100 chars): %s", StringUtils.left(res, 100) + " ..."));
+		return res;
+	}
+
+	public ISLookUpService getIsLookup() {
+		return isLookup;
+	}
+
+	public void setIsLookup(ISLookUpService isLookup) {
+		this.isLookup = isLookup;
+	}
+
+}
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/StreamingInputDocumentFactory.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/StreamingInputDocumentFactory.java
@ -46,11 +46,6 @@ public class StreamingInputDocumentFactory {

 	private static final String INDEX_RECORD_ID = INDEX_FIELD_PREFIX + "indexrecordidentifier";

-	private static final String outFormat = "yyyy-MM-dd'T'hh:mm:ss'Z'";
-
-	private static final List<String> dateFormats = Arrays
-		.asList("yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "dd-MM-yyyy", "dd/MM/yyyy", "yyyy");
-
 	private static final String DEFAULTDNETRESULT = "dnetResult";

 	private static final String TARGETFIELDS = "targetFields";
@ -125,13 +120,12 @@ public class StreamingInputDocumentFactory {
 			}

 			if (!indexDocument.containsKey(INDEX_RECORD_ID)) {
-				indexDocument.clear();
-				System.err.println("missing indexrecord id:\n" + inputDocument);
+				throw new IllegalStateException("cannot extract record ID from: " + inputDocument);
 			}

 			return indexDocument;
 		} catch (XMLStreamException e) {
-			return new SolrInputDocument();
+			throw new IllegalStateException(e);
 		}
 	}

--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java
@ -901,28 +901,6 @@ public class XmlRecordFactory implements Serializable {
 				if (p.getEcsc39() != null) {
 					metadata.add(XmlSerializationUtils.asXmlElement("ecsc39", p.getEcsc39().getValue()));
 				}
-				if (p.getContactfullname() != null) {
-					metadata
-						.add(
-							XmlSerializationUtils
-								.asXmlElement(
-									"contactfullname", p.getContactfullname().getValue()));
-				}
-				if (p.getContactfax() != null) {
-					metadata
-						.add(
-							XmlSerializationUtils.asXmlElement("contactfax", p.getContactfax().getValue()));
-				}
-				if (p.getContactphone() != null) {
-					metadata
-						.add(
-							XmlSerializationUtils.asXmlElement("contactphone", p.getContactphone().getValue()));
-				}
-				if (p.getContactemail() != null) {
-					metadata
-						.add(
-							XmlSerializationUtils.asXmlElement("contactemail", p.getContactemail().getValue()));
-				}
 				if (p.getSummary() != null) {
 					metadata.add(XmlSerializationUtils.asXmlElement("summary", p.getSummary().getValue()));
 				}
--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_update_index.json
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_update_index.json
@ -22,5 +22,17 @@
    "paramLongName": "batchSize",
    "paramDescription": "size of the batch of documents sent to solr",
    "paramRequired": false
+  },
+  {
+    "paramName": "of",
+    "paramLongName": "outputFormat",
+    "paramDescription": "decides the job output format, SOLR | HDFS",
+    "paramRequired": false
+  },
+  {
+    "paramName": "op",
+    "paramLongName": "outputPath",
+    "paramDescription": "path on hdfs activating an alternative output for the SolrInputDocuments",
+    "paramRequired": false
  }
 ]
--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
@ -42,6 +42,7 @@
            <value>*:*</value>
            <description>query used in the deleted by query operation</description>
        </property>
+
        <property>
            <name>sparkDriverMemoryForJoining</name>
            <description>memory for driver process</description>
@ -638,6 +639,8 @@
            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
            <arg>--format</arg><arg>${format}</arg>
            <arg>--batchSize</arg><arg>${batchSize}</arg>
+            <arg>--outputFormat</arg><arg>${outputFormat}</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/solr_documents</arg>
        </spark>
        <ok to="commit_solr_collection"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplicationTest.java
+++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplicationTest.java
@ -1,107 +1,18 @@

 package eu.dnetlib.dhp.oa.provision;

-import java.io.File;
-import java.nio.file.Path;
-
-import org.apache.solr.client.solrj.SolrResponse;
-import org.apache.solr.client.solrj.embedded.JettyConfig;
-import org.apache.solr.client.solrj.impl.CloudSolrClient;
-import org.apache.solr.client.solrj.impl.XMLResponseParser;
-import org.apache.solr.client.solrj.request.CollectionAdminRequest;
-import org.apache.solr.client.solrj.request.ConfigSetAdminRequest;
-import org.apache.solr.client.solrj.request.QueryRequest;
-import org.apache.solr.client.solrj.request.RequestWriter;
-import org.apache.solr.client.solrj.response.CollectionAdminResponse;
-import org.apache.solr.client.solrj.response.ConfigSetAdminResponse;
 import org.apache.solr.client.solrj.response.SolrPingResponse;
 import org.apache.solr.client.solrj.response.UpdateResponse;
-import org.apache.solr.cloud.MiniSolrCloudCluster;
-import org.apache.solr.common.params.CollectionParams;
-import org.apache.solr.common.params.CoreAdminParams;
-import org.apache.solr.common.params.ModifiableSolrParams;
-import org.apache.solr.common.util.NamedList;
-import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
-import org.junit.jupiter.api.io.TempDir;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;

 import junit.framework.Assert;

-public class SolrAdminApplicationTest {
-
-	private static final Logger log = LoggerFactory.getLogger(SolrAdminApplicationTest.class);
-	public static final String DEFAULT_COLLECTION = "testCollection";
-	public static final String CONFIG_NAME = "testConfig";
-
-	private static MiniSolrCloudCluster miniCluster;
-	private static CloudSolrClient cloudSolrClient;
-
-	@TempDir
-	public static Path tempDir;
-
-	@BeforeAll
-	public static void setup() throws Exception {
-
-		// random unassigned HTTP port
-		final int jettyPort = 0;
-
-		final JettyConfig jettyConfig = JettyConfig.builder().setPort(jettyPort).build();
-
-		// create a MiniSolrCloudCluster instance
-		miniCluster = new MiniSolrCloudCluster(2, tempDir, jettyConfig);
-
-		// Upload Solr configuration directory to ZooKeeper
-		String solrZKConfigDir = "src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig";
-		File configDir = new File(solrZKConfigDir);
-
-		miniCluster.uploadConfigSet(configDir.toPath(), CONFIG_NAME);
-
-		// override settings in the solrconfig include
-		System.setProperty("solr.tests.maxBufferedDocs", "100000");
-		System.setProperty("solr.tests.maxIndexingThreads", "-1");
-		System.setProperty("solr.tests.ramBufferSizeMB", "100");
-
-		// use non-test classes so RandomizedRunner isn't necessary
-		System.setProperty("solr.tests.mergeScheduler", "org.apache.lucene.index.ConcurrentMergeScheduler");
-		System.setProperty("solr.directoryFactory", "solr.RAMDirectoryFactory");
-
-		cloudSolrClient = miniCluster.getSolrClient();
-		cloudSolrClient.setRequestWriter(new RequestWriter());
-		cloudSolrClient.setParser(new XMLResponseParser());
-		cloudSolrClient.setDefaultCollection(DEFAULT_COLLECTION);
-		cloudSolrClient.connect();
-
-		log.info(new ConfigSetAdminRequest.List().process(cloudSolrClient).toString());
-		log.info(CollectionAdminRequest.ClusterStatus.getClusterStatus().process(cloudSolrClient).toString());
-
-		createCollection(cloudSolrClient, DEFAULT_COLLECTION, 2, 1, CONFIG_NAME);
-	}
-
-	@AfterAll
-	public static void shutDown() throws Exception {
-		miniCluster.shutdown();
-	}
-
-	protected static NamedList<Object> createCollection(CloudSolrClient client, String name, int numShards,
-		int replicationFactor, String configName) throws Exception {
-		ModifiableSolrParams modParams = new ModifiableSolrParams();
-		modParams.set(CoreAdminParams.ACTION, CollectionParams.CollectionAction.CREATE.name());
-		modParams.set("name", name);
-		modParams.set("numShards", numShards);
-		modParams.set("replicationFactor", replicationFactor);
-		modParams.set("collection.configName", configName);
-		QueryRequest request = new QueryRequest(modParams);
-		request.setPath("/admin/collections");
-		return client.request(request);
-	}
+public class SolrAdminApplicationTest extends SolrTest {

 	@Test
 	public void testPing() throws Exception {
-		SolrPingResponse pingResponse = cloudSolrClient.ping();
+		SolrPingResponse pingResponse = miniCluster.getSolrClient().ping();
 		log.info("pingResponse: '{}'", pingResponse.getStatus());
 		Assert.assertTrue(pingResponse.getStatus() == 0);
 	}
--- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrTest.java
+++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrTest.java
@ -0,0 +1,109 @@
+
+package eu.dnetlib.dhp.oa.provision;
+
+import java.io.File;
+import java.nio.file.Path;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.solr.client.solrj.embedded.JettyConfig;
+import org.apache.solr.client.solrj.impl.CloudSolrClient;
+import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.client.solrj.request.ConfigSetAdminRequest;
+import org.apache.solr.client.solrj.request.QueryRequest;
+import org.apache.solr.cloud.MiniSolrCloudCluster;
+import org.apache.solr.common.params.CollectionParams;
+import org.apache.solr.common.params.CoreAdminParams;
+import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.common.util.NamedList;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.io.TempDir;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public abstract class SolrTest {
+
+	protected static final Logger log = LoggerFactory.getLogger(SolrTest.class);
+
+	protected static final String FORMAT = "test";
+	protected static final String DEFAULT_COLLECTION = FORMAT + "-index-openaire";
+	protected static final String CONFIG_NAME = "testConfig";
+
+	protected static MiniSolrCloudCluster miniCluster;
+
+	@TempDir
+	public static Path workingDir;
+
+	@BeforeAll
+	public static void setup() throws Exception {
+
+		// random unassigned HTTP port
+		final int jettyPort = 0;
+		final JettyConfig jettyConfig = JettyConfig.builder().setPort(jettyPort).build();
+
+		log.info(String.format("working directory: %s", workingDir.toString()));
+		System.setProperty("solr.log.dir", workingDir.resolve("logs").toString());
+
+		// create a MiniSolrCloudCluster instance
+		miniCluster = new MiniSolrCloudCluster(2, workingDir.resolve("solr"), jettyConfig);
+
+		// Upload Solr configuration directory to ZooKeeper
+		String solrZKConfigDir = "src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig";
+		File configDir = new File(solrZKConfigDir);
+
+		miniCluster.uploadConfigSet(configDir.toPath(), CONFIG_NAME);
+
+		// override settings in the solrconfig include
+		System.setProperty("solr.tests.maxBufferedDocs", "100000");
+		System.setProperty("solr.tests.maxIndexingThreads", "-1");
+		System.setProperty("solr.tests.ramBufferSizeMB", "100");
+
+		// use non-test classes so RandomizedRunner isn't necessary
+		System.setProperty("solr.tests.mergeScheduler", "org.apache.lucene.index.ConcurrentMergeScheduler");
+		System.setProperty("solr.directoryFactory", "solr.RAMDirectoryFactory");
+		System.setProperty("solr.lock.type", "single");
+
+		log.info(new ConfigSetAdminRequest.List().process(miniCluster.getSolrClient()).toString());
+		log
+			.info(
+				CollectionAdminRequest.ClusterStatus
+					.getClusterStatus()
+					.process(miniCluster.getSolrClient())
+					.toString());
+
+		NamedList<Object> res = createCollection(
+			miniCluster.getSolrClient(), DEFAULT_COLLECTION, 4, 2, 20, CONFIG_NAME);
+		res.forEach(o -> log.info(o.toString()));
+
+		miniCluster.getSolrClient().setDefaultCollection(DEFAULT_COLLECTION);
+
+		log
+			.info(
+				CollectionAdminRequest.ClusterStatus
+					.getClusterStatus()
+					.process(miniCluster.getSolrClient())
+					.toString());
+
+	}
+
+	@AfterAll
+	public static void shutDown() throws Exception {
+		miniCluster.shutdown();
+		FileUtils.deleteDirectory(workingDir.toFile());
+	}
+
+	protected static NamedList<Object> createCollection(CloudSolrClient client, String name, int numShards,
+		int replicationFactor, int maxShardsPerNode, String configName) throws Exception {
+		ModifiableSolrParams modParams = new ModifiableSolrParams();
+		modParams.set(CoreAdminParams.ACTION, CollectionParams.CollectionAction.CREATE.name());
+		modParams.set("name", name);
+		modParams.set("numShards", numShards);
+		modParams.set("replicationFactor", replicationFactor);
+		modParams.set("collection.configName", configName);
+		modParams.set("maxShardsPerNode", maxShardsPerNode);
+		QueryRequest request = new QueryRequest(modParams);
+		request.setPath("/admin/collections");
+		return client.request(request);
+	}
+
+}
--- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJobTest.java
+++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJobTest.java
@ -0,0 +1,149 @@
+
+package eu.dnetlib.dhp.oa.provision;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.net.URI;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.solr.client.solrj.SolrQuery;
+import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.SolrInputField;
+import org.apache.solr.common.params.CommonParams;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SparkSession;
+import org.dom4j.io.SAXReader;
+import org.junit.jupiter.api.*;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.mockito.Mock;
+import org.mockito.Mockito;
+import org.mockito.junit.jupiter.MockitoExtension;
+
+import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
+import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+
+@ExtendWith(MockitoExtension.class)
+public class XmlIndexingJobTest extends SolrTest {
+
+	protected static SparkSession spark;
+
+	private static final Integer batchSize = 100;
+
+	@Mock
+	private ISLookUpService isLookUpService;
+
+	@Mock
+	private ISLookupClient isLookupClient;
+
+	@BeforeEach
+	public void prepareMocks() throws ISLookUpException, IOException {
+		isLookupClient.setIsLookup(isLookUpService);
+
+		int solrPort = URI.create("http://" + miniCluster.getZkClient().getZkServerAddress()).getPort();
+
+		Mockito
+			.when(isLookupClient.getDsId(Mockito.anyString()))
+			.thenReturn("313f0381-23b6-466f-a0b8-c72a9679ac4b_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl");
+		Mockito.when(isLookupClient.getZkHost()).thenReturn(String.format("127.0.0.1:%s/solr", solrPort));
+		Mockito
+			.when(isLookupClient.getLayoutSource(Mockito.anyString()))
+			.thenReturn(IOUtils.toString(getClass().getResourceAsStream("fields.xml")));
+		Mockito
+			.when(isLookupClient.getLayoutTransformer())
+			.thenReturn(IOUtils.toString(getClass().getResourceAsStream("layoutToRecordTransformer.xsl")));
+	}
+
+	@BeforeAll
+	public static void before() {
+
+		SparkConf conf = new SparkConf();
+		conf.setAppName(XmlIndexingJobTest.class.getSimpleName());
+		conf.registerKryoClasses(new Class[] {
+			SerializableSolrInputDocument.class
+		});
+
+		conf.setMaster("local[1]");
+		conf.set("spark.driver.host", "localhost");
+		conf.set("hive.metastore.local", "true");
+		conf.set("spark.ui.enabled", "false");
+		conf.set("spark.sql.warehouse.dir", workingDir.resolve("spark").toString());
+
+		spark = SparkSession
+			.builder()
+			.appName(XmlIndexingJobTest.class.getSimpleName())
+			.config(conf)
+			.getOrCreate();
+	}
+
+	@AfterAll
+	public static void tearDown() {
+		spark.stop();
+	}
+
+	@Test
+	public void testXmlIndexingJob_onSolr() throws Exception {
+
+		String inputPath = "src/test/resources/eu/dnetlib/dhp/oa/provision/xml";
+
+		long nRecord = JavaSparkContext
+			.fromSparkContext(spark.sparkContext())
+			.sequenceFile(inputPath, Text.class, Text.class)
+			.count();
+
+		new XmlIndexingJob(spark, inputPath, FORMAT, batchSize, XmlIndexingJob.OutputFormat.SOLR, null)
+			.run(isLookupClient);
+
+		Assertions.assertEquals(0, miniCluster.getSolrClient().commit().getStatus());
+
+		QueryResponse rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "*:*"));
+
+		Assertions
+			.assertEquals(
+				nRecord, rsp.getResults().getNumFound(),
+				"the number of indexed records should be equal to the number of input records");
+	}
+
+	@Test
+	public void testXmlIndexingJob_saveOnHDFS() throws Exception {
+		final String ID_XPATH = "//header/*[local-name()='objIdentifier']";
+
+		String inputPath = "src/test/resources/eu/dnetlib/dhp/oa/provision/xml";
+
+		final JavaPairRDD<Text, Text> xmlRecords = JavaSparkContext
+			.fromSparkContext(spark.sparkContext())
+			.sequenceFile(inputPath, Text.class, Text.class);
+		long nRecord = xmlRecords.count();
+		long xmlIdUnique = xmlRecords
+			.map(t -> t._2().toString())
+			.map(s -> new SAXReader().read(new StringReader(s)).valueOf(ID_XPATH))
+			.distinct()
+			.count();
+		Assertions.assertEquals(nRecord, xmlIdUnique, "IDs should be unique among input records");
+
+		final String outputPath = workingDir.resolve("outputPath").toAbsolutePath().toString();
+		new XmlIndexingJob(spark, inputPath, FORMAT, batchSize, XmlIndexingJob.OutputFormat.HDFS, outputPath)
+			.run(isLookupClient);
+
+		final Dataset<SerializableSolrInputDocument> solrDocs = spark
+			.read()
+			.load(outputPath)
+			.as(Encoders.kryo(SerializableSolrInputDocument.class));
+		long docIdUnique = solrDocs.map((MapFunction<SerializableSolrInputDocument, String>) doc -> {
+			final SolrInputField id = doc.getField("__indexrecordidentifier");
+			return id.getFirstValue().toString();
+		}, Encoders.STRING())
+			.distinct()
+			.count();
+		Assertions.assertEquals(xmlIdUnique, docIdUnique, "IDs should be unique among the output records");
+
+	}
+
+}
--- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml
@ -105,7 +105,7 @@
        <FIELD indexable="true" name="relorganizationname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalname)"/>
        <FIELD indexable="true" name="relorganizationshortname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalshortname)"/>
        <FIELD indexable="true" name="relresultid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to[@type='result'])"/>
-        <FIELD indexable="true" name="relresulttype" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/resulttype/@classid)"/>
+        <FIELD indexable="true" name="relresulttype" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to/@type)"/>
        <FIELD indexable="true" name="relclass" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to/@class)"/>
        <FIELD indexable="true" name="relfundinglevel0_id" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']//rel/funding/funding_level_0"/>
        <FIELD indexable="true" name="relfundinglevel0_name" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']//rel/funding/funding_level_0/@name/string()"/>
@ -123,7 +123,8 @@
        <FIELD indexable="true" name="relfundername" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']//rel/funding/funder/@name)"/>
        <FIELD indexable="true" name="relfunderjurisdiction" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']//rel/funding/funder/@jurisdiction)"/><!-- Collected from of the related entity. Available for result-result relationships -->
        <FIELD indexable="true" name="relcollectedfromid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/collectedfrom/@id)"/>
-        <FIELD indexable="true" name="relcollectedfromname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/collectedfrom/@name)"/><!-- COMMON FIELDS -->
+        <FIELD indexable="true" name="relcollectedfromname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/collectedfrom/@name)"/>
+        <FIELD indexable="true" name="semrelid" result="false" stat="false" tokenizable="false" value="concat(./to/text(), '||', ./to/@class/string())" xpath="//*[local-name()='entity']//rel"/><!-- COMMON FIELDS -->
        <FIELD indexable="true" multivalued="false" name="dateofcollection" result="false" stat="false" type="pdate" value="//header/*[local-name()='dateOfCollection']"/>
        <FIELD indexable="true" name="collectedfrom" result="false" stat="false" tokenizable="false" value="distinct-values(concat(./@id, '||', ./@name))" xpath="//*[local-name()='entity']/*/*[local-name()='collectedfrom'] | //*[local-name()='entity']/*//*[local-name() = 'instance']/*[local-name()='collectedfrom']"/>
        <FIELD indexable="true" name="collectedfromdatasourceid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/*[local-name()='collectedfrom']/@id | //*[local-name()='entity']/*//*[local-name() = 'instance']/*[local-name()='collectedfrom']/@id)"/>
--- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig/elevate.xml
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig/elevate.xml
@ -0,0 +1,31 @@
+Unless required by applicable law or agreed to in writing, software
+        distributed under the License is distributed on an "AS IS" BASIS,
+        WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+        See the License for the specific language governing permissions and
+        limitations under the License.
+        -->
+
+        <!-- If this file is found in the config directory, it will only be
+             loaded once at startup.  If it is found in Solr's data
+             directory, it will be re-loaded every commit.
+
+           See http://wiki.apache.org/solr/QueryElevationComponent for more info
+
+        -->
+<elevate>
+    <!-- Query elevation examples
+     <query text="foo bar">
+       <doc id="1" />
+       <doc id="2" />
+       <doc id="3" />
+     </query>
+
+   for use with techproducts example
+
+     <query text="ipod">
+       <doc id="MA147LL/A" />  put the actual ipod at the top
+       <doc id="IW-02" exclude="true" /> exclude this cable
+     </query>
+   -->
+
+</elevate>
--- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig/managed-schema
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig/managed-schema
--- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig/solrconfig.xml
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig/solrconfig.xml
@ -83,6 +83,7 @@

  <lib dir="${solr.install.dir:../../../..}/contrib/velocity/lib" regex=".*\.jar" />
  <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-velocity-\d.*\.jar" />
+
  <!-- an exact 'path' can be used instead of a 'dir' to specify a
       specific jar file.  This will cause a serious error to be logged
       if it can't be loaded.
@ -112,7 +113,8 @@
       One can force a particular implementation via solr.MMapDirectoryFactory,
       solr.NIOFSDirectoryFactory, or solr.SimpleFSDirectoryFactory.

-       solr.RAMDirectoryFactory is memory based and not persistent.
+       solr.RAMDirectoryFactory is memory based, not
+       persistent, and doesn't work with replication.
    -->
  <directoryFactory name="DirectoryFactory"
                    class="${solr.directoryFactory:solr.NRTCachingDirectoryFactory}"/>
@ -204,7 +206,7 @@
         More details on the nuances of each LockFactory...
         http://wiki.apache.org/lucene-java/AvailableLockFactories
    -->
-    <lockType>${solr.lock.type:single}</lockType>
+    <lockType>${solr.lock.type:native}</lockType>

    <!-- Commit Deletion Policy
         Custom deletion policies can be specified here. The class must
@ -331,6 +333,29 @@
         postCommit - fired after every commit or optimize command
         postOptimize - fired after every optimize command
      -->
+    <!-- The RunExecutableListener executes an external command from a
+         hook such as postCommit or postOptimize.
+
+         exe - the name of the executable to run
+         dir - dir to use as the current working directory. (default=".")
+         wait - the calling thread waits until the executable returns.
+                (default="true")
+         args - the arguments to pass to the program.  (default is none)
+         env - environment variables to set.  (default is none)
+      -->
+    <!-- This example shows how RunExecutableListener could be used
+         with the script based replication...
+         http://wiki.apache.org/solr/CollectionDistribution
+      -->
+    <!--
+       <listener event="postCommit" class="solr.RunExecutableListener">
+         <str name="exe">solr/bin/snapshooter</str>
+         <str name="dir">.</str>
+         <bool name="wait">true</bool>
+         <arr name="args"> <str>arg1</str> <str>arg2</str> </arr>
+         <arr name="env"> <str>MYVAR=val1</str> </arr>
+       </listener>
+      -->

  </updateHandler>

@ -366,14 +391,22 @@
       Query section - these settings control query time things like caches
       ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
  <query>
+    <!-- Max Boolean Clauses
+
+         Maximum number of clauses in each BooleanQuery,  an exception
+         is thrown if exceeded.
+
+         ** WARNING **
+
+         This option actually modifies a global Lucene property that
+         will affect all SolrCores.  If multiple solrconfig.xml files
+         disagree on this property, the value at any given moment will
+         be based on the last SolrCore to be initialized.

-    <!-- Maximum number of clauses in each BooleanQuery,  an exception
-         is thrown if exceeded.  It is safe to increase or remove this setting,
-         since it is purely an arbitrary limit to try and catch user errors where
-         large boolean queries may not be the best implementation choice.
      -->
    <maxBooleanClauses>1024</maxBooleanClauses>

+
    <!-- Solr Internal Query Caches

         There are two implementations of cache available for Solr,
@ -575,8 +608,21 @@
       This section contains instructions for how the SolrDispatchFilter
       should behave when processing requests for this SolrCore.

+       handleSelect is a legacy option that affects the behavior of requests
+       such as /select?qt=XXX
+
+       handleSelect="true" will cause the SolrDispatchFilter to process
+       the request and dispatch the query to a handler specified by the
+       "qt" param, assuming "/select" isn't already registered.
+
+       handleSelect="false" will cause the SolrDispatchFilter to
+       ignore "/select" requests, resulting in a 404 unless a handler
+       is explicitly registered with the name "/select"
+
+       handleSelect="true" is not recommended for new users, but is the default
+       for backwards compatibility
    -->
-  <requestDispatcher>
+  <requestDispatcher handleSelect="false" >
    <!-- Request Parsing

         These settings indicate how Solr Requests may be parsed, and
@ -602,14 +648,15 @@
         plugins.

         *** WARNING ***
-         Before enabling remote streaming, you should make sure your
-         system has authentication enabled.
+         The settings below authorize Solr to fetch remote files, You
+         should make sure your system has some authentication before
+         using enableRemoteStreaming="true"

-    <requestParsers enableRemoteStreaming="false"
-                    multipartUploadLimitInKB="-1"
-                    formdataUploadLimitInKB="-1"
-                    addHttpRequestToContext="false"/>
      -->
+    <requestParsers enableRemoteStreaming="true"
+                    multipartUploadLimitInKB="2048000"
+                    formdataUploadLimitInKB="2048"
+                    addHttpRequestToContext="false"/>

    <!-- HTTP Caching

@ -673,6 +720,14 @@
       Incoming queries will be dispatched to a specific handler by name
       based on the path specified in the request.

+       Legacy behavior: If the request path uses "/select" but no Request
+       Handler has that name, and if handleSelect="true" has been specified in
+       the requestDispatcher, then the Request Handler is dispatched based on
+       the qt parameter.  Handlers without a leading '/' are accessed this way
+       like so: http://host/app/[core/]select?qt=name  If no qt is
+       given, then the requestHandler that declares default="true" will be
+       used or the one named "standard".
+
       If a Request Handler is declared with startup="lazy", then it will
       not be initialized until the first request that uses it.

@ -692,13 +747,9 @@
      -->
    <lst name="defaults">
      <str name="echoParams">explicit</str>
+      <str name="q.op">AND</str>
      <int name="rows">10</int>
-      <!-- Default search field
-         <str name="df">text</str> 
-        -->
-      <!-- Change from JSON to XML format (the default prior to Solr 7.0)
-         <str name="wt">xml</str> 
-        -->
+      <!-- <str name="df">text</str> -->
    </lst>
    <!-- In addition to defaults, "appends" params can be specified
         to identify values which should be appended to the list of
@ -781,10 +832,18 @@

  <initParams path="/update/**,/query,/select,/tvrh,/elevate,/spell,/browse">
    <lst name="defaults">
-      <str name="df">_text_</str>
+      <str name="df">__all</str>
    </lst>
  </initParams>

+  <!-- This enabled schemaless mode
+  <initParams path="/update/**">
+    <lst name="defaults">
+      <str name="update.chain">add-unknown-fields-to-the-schema</str>
+    </lst>
+  </initParams>
+  -->
+
  <!-- Solr Cell Update Request Handler

       http://wiki.apache.org/solr/ExtractingRequestHandler
@ -796,10 +855,9 @@
    <lst name="defaults">
      <str name="lowernames">true</str>
      <str name="fmap.meta">ignored_</str>
-      <str name="fmap.content">_text_</str>
+      <str name="fmap.content">__all</str>
    </lst>
  </requestHandler>
-
  <!-- Search Components

       Search components are registered to SolrCore and used by
@ -861,7 +919,7 @@
    <!-- a spellchecker built from a field of the main index -->
    <lst name="spellchecker">
      <str name="name">default</str>
-      <str name="field">_text_</str>
+      <str name="field">__all</str>
      <str name="classname">solr.DirectSolrSpellChecker</str>
      <!-- the spellcheck distance measure used, the default is the internal levenshtein -->
      <str name="distanceMeasure">internal</str>
@ -986,6 +1044,7 @@
  <searchComponent name="elevator" class="solr.QueryElevationComponent" >
    <!-- pick a fieldType to analyze queries -->
    <str name="queryFieldType">string</str>
+    <str name="config-file">elevate.xml</str>
  </searchComponent>

  <!-- A request handler for demonstrating the elevator component -->
@ -1116,81 +1175,70 @@

  <!-- Add unknown fields to the schema

-       Field type guessing update processors that will
+       An example field type guessing update processor that will
       attempt to parse string-typed field values as Booleans, Longs,
       Doubles, or Dates, and then add schema fields with the guessed
-       field types. Text content will be indexed as "text_general" as
-       well as a copy to a plain string version in *_str.
+       field types.

-       These require that the schema is both managed and mutable, by
+       This requires that the schema is both managed and mutable, by
       declaring schemaFactory as ManagedIndexSchemaFactory, with
       mutable specified as true.

       See http://wiki.apache.org/solr/GuessingFieldTypes
    -->
-  <updateProcessor class="solr.UUIDUpdateProcessorFactory" name="uuid"/>
-  <updateProcessor class="solr.RemoveBlankFieldUpdateProcessorFactory" name="remove-blank"/>
-  <updateProcessor class="solr.FieldNameMutatingUpdateProcessorFactory" name="field-name-mutating">
-    <str name="pattern">[^\w-\.]</str>
-    <str name="replacement">_</str>
-  </updateProcessor>
-  <updateProcessor class="solr.ParseBooleanFieldUpdateProcessorFactory" name="parse-boolean"/>
-  <updateProcessor class="solr.ParseLongFieldUpdateProcessorFactory" name="parse-long"/>
-  <updateProcessor class="solr.ParseDoubleFieldUpdateProcessorFactory" name="parse-double"/>
-  <updateProcessor class="solr.ParseDateFieldUpdateProcessorFactory" name="parse-date">
-    <arr name="format">
-      <str>yyyy-MM-dd'T'HH:mm:ss.SSSZ</str>
-      <str>yyyy-MM-dd'T'HH:mm:ss,SSSZ</str>
-      <str>yyyy-MM-dd'T'HH:mm:ss.SSS</str>
-      <str>yyyy-MM-dd'T'HH:mm:ss,SSS</str>
-      <str>yyyy-MM-dd'T'HH:mm:ssZ</str>
-      <str>yyyy-MM-dd'T'HH:mm:ss</str>
-      <str>yyyy-MM-dd'T'HH:mmZ</str>
-      <str>yyyy-MM-dd'T'HH:mm</str>
-      <str>yyyy-MM-dd HH:mm:ss.SSSZ</str>
-      <str>yyyy-MM-dd HH:mm:ss,SSSZ</str>
-      <str>yyyy-MM-dd HH:mm:ss.SSS</str>
-      <str>yyyy-MM-dd HH:mm:ss,SSS</str>
-      <str>yyyy-MM-dd HH:mm:ssZ</str>
-      <str>yyyy-MM-dd HH:mm:ss</str>
-      <str>yyyy-MM-dd HH:mmZ</str>
-      <str>yyyy-MM-dd HH:mm</str>
-      <str>yyyy-MM-dd</str>
-    </arr>
-  </updateProcessor>
-  <updateProcessor class="solr.AddSchemaFieldsUpdateProcessorFactory" name="add-schema-fields">
-    <lst name="typeMapping">
-      <str name="valueClass">java.lang.String</str>
-      <str name="fieldType">text_general</str>
-      <lst name="copyField">
-        <str name="dest">*_str</str>
-        <int name="maxChars">256</int>
+  <updateRequestProcessorChain name="add-unknown-fields-to-the-schema">
+    <!-- UUIDUpdateProcessorFactory will generate an id if none is present in the incoming document -->
+    <processor class="solr.UUIDUpdateProcessorFactory" />
+    <processor class="solr.RemoveBlankFieldUpdateProcessorFactory"/>
+    <processor class="solr.FieldNameMutatingUpdateProcessorFactory">
+      <str name="pattern">[^\w-\.]</str>
+      <str name="replacement">_</str>
+    </processor>
+    <processor class="solr.ParseBooleanFieldUpdateProcessorFactory"/>
+    <processor class="solr.ParseLongFieldUpdateProcessorFactory"/>
+    <processor class="solr.ParseDoubleFieldUpdateProcessorFactory"/>
+    <processor class="solr.ParseDateFieldUpdateProcessorFactory">
+      <arr name="format">
+        <str>yyyy-MM-dd'T'HH:mm:ss.SSSZ</str>
+        <str>yyyy-MM-dd'T'HH:mm:ss,SSSZ</str>
+        <str>yyyy-MM-dd'T'HH:mm:ss.SSS</str>
+        <str>yyyy-MM-dd'T'HH:mm:ss,SSS</str>
+        <str>yyyy-MM-dd'T'HH:mm:ssZ</str>
+        <str>yyyy-MM-dd'T'HH:mm:ss</str>
+        <str>yyyy-MM-dd'T'HH:mmZ</str>
+        <str>yyyy-MM-dd'T'HH:mm</str>
+        <str>yyyy-MM-dd HH:mm:ss.SSSZ</str>
+        <str>yyyy-MM-dd HH:mm:ss,SSSZ</str>
+        <str>yyyy-MM-dd HH:mm:ss.SSS</str>
+        <str>yyyy-MM-dd HH:mm:ss,SSS</str>
+        <str>yyyy-MM-dd HH:mm:ssZ</str>
+        <str>yyyy-MM-dd HH:mm:ss</str>
+        <str>yyyy-MM-dd HH:mmZ</str>
+        <str>yyyy-MM-dd HH:mm</str>
+        <str>yyyy-MM-dd</str>
+      </arr>
+    </processor>
+    <processor class="solr.AddSchemaFieldsUpdateProcessorFactory">
+      <str name="defaultFieldType">strings</str>
+      <lst name="typeMapping">
+        <str name="valueClass">java.lang.Boolean</str>
+        <str name="fieldType">booleans</str>
      </lst>
-      <!-- Use as default mapping instead of defaultFieldType -->
-      <bool name="default">true</bool>
-    </lst>
-    <lst name="typeMapping">
-      <str name="valueClass">java.lang.Boolean</str>
-      <str name="fieldType">booleans</str>
-    </lst>
-    <lst name="typeMapping">
-      <str name="valueClass">java.util.Date</str>
-      <str name="fieldType">pdates</str>
-    </lst>
-    <lst name="typeMapping">
-      <str name="valueClass">java.lang.Long</str>
-      <str name="valueClass">java.lang.Integer</str>
-      <str name="fieldType">plongs</str>
-    </lst>
-    <lst name="typeMapping">
-      <str name="valueClass">java.lang.Number</str>
-      <str name="fieldType">pdoubles</str>
-    </lst>
-  </updateProcessor>
+      <lst name="typeMapping">
+        <str name="valueClass">java.util.Date</str>
+        <str name="fieldType">tdates</str>
+      </lst>
+      <lst name="typeMapping">
+        <str name="valueClass">java.lang.Long</str>
+        <str name="valueClass">java.lang.Integer</str>
+        <str name="fieldType">tlongs</str>
+      </lst>
+      <lst name="typeMapping">
+        <str name="valueClass">java.lang.Number</str>
+        <str name="fieldType">tdoubles</str>
+      </lst>
+    </processor>

-  <!-- The update.autoCreateFields property can be turned to false to disable schemaless mode -->
-  <updateRequestProcessorChain name="add-unknown-fields-to-the-schema" default="${update.autoCreateFields:true}"
-           processor="uuid,remove-blank,field-name-mutating,parse-boolean,parse-long,parse-double,parse-date,add-schema-fields">
    <processor class="solr.LogUpdateProcessorFactory"/>
    <processor class="solr.DistributedUpdateProcessorFactory"/>
    <processor class="solr.RunUpdateProcessorFactory"/>
@ -1313,7 +1361,7 @@

  <!-- Query Parsers

-       https://lucene.apache.org/solr/guide/query-syntax-and-parsing.html
+       https://cwiki.apache.org/confluence/display/solr/Query+Syntax+and+Parsing

       Multiple QParserPlugins can be registered by name, and then
       used in either the "defType" param for the QueryComponent (used
@ -1361,4 +1409,4 @@
      EditorialMarkerFactory will do exactly that:
     <transformer name="qecBooster" class="org.apache.solr.response.transform.EditorialMarkerFactory" />
    -->
-</config>
+</config>
--- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/xml/part-00000
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/xml/part-00000
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
@ -46,7 +46,7 @@
        </configuration>
    </global>

-    <start to="Step18"/>
+    <start to="Step1"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>