Merge pull request 'XML record indexing test' (#58) from provision_indexing into master

2020-11-18 17:04:34 +01:00 · 2020-11-18 17:04:34 +01:00 · ede7fae6c8
parent 12acf25519 5218718e8b
commit ede7fae6c8
19 changed files with 1066 additions and 1351 deletions
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java
@ -7,8 +7,6 @@ import static org.mockito.Mockito.lenient;
 import java.io.IOException;
 import java.util.List;
 import java.util.Set;
 import java.util.function.Predicate;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.commons.io.IOUtils;
@ -21,7 +19,10 @@ import org.mockito.junit.jupiter.MockitoExtension;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
-import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.oaf.Publication;
 import eu.dnetlib.dhp.schema.oaf.Qualifier;
 import eu.dnetlib.dhp.schema.oaf.Result;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
--- a/dhp-workflows/dhp-graph-provision/pom.xml
+++ b/dhp-workflows/dhp-graph-provision/pom.xml
@ -22,6 +22,12 @@
        <dependency>
            <groupId>com.jayway.jsonpath</groupId>
            <artifactId>json-path</artifactId>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-api</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>dom4j</groupId>
@ -82,9 +88,6 @@
                    <groupId>org.codehaus.woodstox</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>com.github.ben-manes.caffeine</groupId>
                    <artifactId>*</artifactId>
@ -109,11 +112,10 @@
                    <groupId>org.apache.hadoop</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
-
+                <exclusion>
-
+                    <groupId>org.apache.zookeeper</groupId>
-
+                    <artifactId>zookeeper</artifactId>
-
+                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/ProvisionConstants.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/ProvisionConstants.java
@ -3,6 +3,10 @@ package eu.dnetlib.dhp.oa.provision;
 public class ProvisionConstants {
 	public static final String LAYOUT = "index";
 	public static final String INTERPRETATION = "openaire";
 	public static final String SEPARATOR = "-";
 	public static final int MAX_EXTERNAL_ENTITIES = 50;
 	public static final int MAX_AUTHORS = 200;
 	public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
@ -11,4 +15,8 @@ public class ProvisionConstants {
 	public static final int MAX_ABSTRACT_LENGTH = 100000;
 	public static final int MAX_INSTANCES = 10;
 	/**
 	 * Builds the name of the Solr collection associated to a metadata format, i.e. the format name,
 	 * {@code LAYOUT} and {@code INTERPRETATION} joined by {@code SEPARATOR}.
 	 *
 	 * @param format the metadata format name
 	 * @return the collection name
 	 */
 	public static String getCollectionName(String format) {
 		return String.join(SEPARATOR, format, LAYOUT, INTERPRETATION);
 	}
 }
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java
@ -14,11 +14,12 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
 import eu.dnetlib.dhp.oa.provision.utils.ZkServers;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
-public class SolrAdminApplication extends SolrApplication implements Closeable {
+public class SolrAdminApplication implements Closeable {
 	private static final Logger log = LoggerFactory.getLogger(SolrAdminApplication.class);
@ -54,12 +55,12 @@ public class SolrAdminApplication extends SolrApplication implements Closeable {
 			.orElse(false);
 		log.info("commit: {}", commit);
-		final ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl);
+		final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));
-		final String zkHost = getZkHost(isLookup);
+		final String zkHost = isLookup.getZkHost();
 		log.info("zkHost: {}", zkHost);
-		final String collection = format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
+		final String collection = ProvisionConstants.getCollectionName(format);
 		log.info("collection: {}", collection);
 		try (SolrAdminApplication app = new SolrAdminApplication(zkHost)) {
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrApplication.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrApplication.java
@ -1,40 +0,0 @@
 package eu.dnetlib.dhp.oa.provision;
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 /**
  * Base class for the Solr related applications: holds the constants used to compose collection
  * names and the helpers querying the information system.
  */
 public abstract class SolrApplication {
 	private static final Logger log = LoggerFactory.getLogger(SolrApplication.class);
 	protected static final String LAYOUT = "index";
 	protected static final String INTERPRETATION = "openaire";
 	protected static final String SEPARATOR = "-";
 	// HH (hour in day, 0-23): the 12-hour 'hh' without an am/pm marker yields ambiguous timestamps
 	protected static final String DATE_FORMAT = "yyyy-MM-dd'T'HH:mm:ss'Z'";
 	/**
 	 * Method retrieves from the information system the zookeeper quorum of the Solr server
 	 *
 	 * @param isLookup the ISLookup service stub
 	 * @return the zookeeper quorum of the Solr server
 	 * @throws ISLookUpException when the lookup service reports an error
 	 */
 	protected static String getZkHost(ISLookUpService isLookup) throws ISLookUpException {
 		return doLookup(
 			isLookup,
 			"for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='IndexServiceResourceType'] return $x//PROTOCOL[./@name='solr']/@address/string()");
 	}
 	/**
 	 * Runs the given xquery against the lookup service, logging the query and the first 100
 	 * characters of the response.
 	 *
 	 * @param isLookup the ISLookup service stub
 	 * @param xquery the query to execute
 	 * @return the response returned by the lookup service, unmodified
 	 * @throws ISLookUpException when the lookup service reports an error
 	 */
 	protected static String doLookup(ISLookUpService isLookup, String xquery) throws ISLookUpException {
 		log.info("running xquery: {}", xquery);
 		final String res = isLookup.getResourceProfileByQuery(xquery);
 		// only a prefix is logged: the response may be a whole profile
 		log.info("got response (100 chars): {} ...", StringUtils.left(res, 100));
 		return res;
 	}
 }
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java
@ -1,8 +1,31 @@
 package eu.dnetlib.dhp.oa.provision;
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+import com.lucidworks.spark.util.SolrSupport;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
 import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
 import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import javax.xml.transform.Transformer;
 import javax.xml.transform.TransformerException;
 import javax.xml.transform.stream.StreamResult;
 import javax.xml.transform.stream.StreamSource;
 import java.io.IOException;
 import java.io.StringReader;
 import java.io.StringWriter;
@ -10,37 +33,26 @@ import java.text.SimpleDateFormat;
 import java.util.Date;
 import java.util.Optional;
-import javax.xml.transform.Transformer;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import javax.xml.transform.TransformerException;
 import javax.xml.transform.stream.StreamResult;
 import javax.xml.transform.stream.StreamSource;
-import org.apache.commons.io.IOUtils;
+public class XmlIndexingJob {
 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.rdd.RDD;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import com.lucidworks.spark.util.SolrSupport;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpDocumentNotFoundException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 public class XmlIndexingJob extends SolrApplication {
 	private static final Logger log = LoggerFactory.getLogger(XmlIndexingJob.class);
 	private static final Integer DEFAULT_BATCH_SIZE = 1000;
 	protected static final String DATE_FORMAT = "yyyy-MM-dd'T'hh:mm:ss'Z'";
 	private String inputPath;
 	private String format;
 	private int batchSize;
 	private String outputPath;
 	private SparkSession spark;
 	public static void main(String[] args) throws Exception {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -60,27 +72,50 @@ public class XmlIndexingJob extends SolrApplication {
 		final String inputPath = parser.get("inputPath");
 		log.info("inputPath: {}", inputPath);
 		final String isLookupUrl = parser.get("isLookupUrl");
 		log.info("isLookupUrl: {}", isLookupUrl);
 		final String format = parser.get("format");
 		log.info("format: {}", format);
 		final String outputPath = Optional.ofNullable(parser.get("outputPath"))
 				.orElse(null);
 		log.info("outputPath: {}", outputPath);
 		final Integer batchSize = parser.getObjectMap().containsKey("batchSize")
 			? Integer.valueOf(parser.get("batchSize"))
 			: DEFAULT_BATCH_SIZE;
 		log.info("batchSize: {}", batchSize);
-		final ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl);
+		final SparkConf conf = new SparkConf();
-		final String fields = getLayoutSource(isLookup, format);
+		conf.registerKryoClasses(new Class[] { SerializableSolrInputDocument.class });
 		runWithSparkSession(
 			conf,
 			isSparkSessionManaged,
 			spark -> {
 				final String isLookupUrl = parser.get("isLookupUrl");
 				log.info("isLookupUrl: {}", isLookupUrl);
 				final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));
 				new XmlIndexingJob(spark, inputPath, format, batchSize, outputPath).run(isLookup);
 			});
 	}
 	/**
 	 * Creates an indexing job, storing the given values in the corresponding fields.
 	 *
 	 * @param spark the Spark session used to read the input records
 	 * @param inputPath path of the sequence file containing the XML records
 	 * @param format the metadata format name, used for the information system lookups and to derive the
 	 *        collection name
 	 * @param batchSize size of the batch of documents sent to Solr; unboxed into an {@code int} field, so
 	 *        it must not be null
 	 * @param outputPath when not blank the documents are written to this path instead of being sent to Solr
 	 */
 	public XmlIndexingJob(SparkSession spark, String inputPath, String format, Integer batchSize, String outputPath) {
 		this.spark = spark;
 		this.inputPath = inputPath;
 		this.format = format;
 		this.batchSize = batchSize;
 		this.outputPath = outputPath;
 	}
 	public void run(ISLookupClient isLookup) throws ISLookUpException, TransformerException {
 		final String fields = isLookup.getLayoutSource(format);
 		log.info("fields: {}", fields);
-		final String xslt = getLayoutTransformer(isLookup);
+		final String xslt = isLookup.getLayoutTransformer();
-		final String dsId = getDsId(format, isLookup);
+		final String dsId = isLookup.getDsId(format);
 		log.info("dsId: {}", dsId);
-		final String zkHost = getZkHost(isLookup);
+		final String zkHost = isLookup.getZkHost();
 		log.info("zkHost: {}", zkHost);
 		final String version = getRecordDatestamp();
@ -88,24 +123,25 @@ public class XmlIndexingJob extends SolrApplication {
 		final String indexRecordXslt = getLayoutTransformer(format, fields, xslt);
 		log.info("indexRecordTransformer {}", indexRecordXslt);
-		final SparkConf conf = new SparkConf();
+		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
-		runWithSparkSession(
+		JavaRDD<SolrInputDocument> docs = sc
-			conf,
+				.sequenceFile(inputPath, Text.class, Text.class)
-			isSparkSessionManaged,
+				.map(t -> t._2().toString())
-			spark -> {
+				.map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s))
-				final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+				.map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s));
-				RDD<SolrInputDocument> docs = sc
+		if (StringUtils.isNotBlank(outputPath)) {
-					.sequenceFile(inputPath, Text.class, Text.class)
+			spark.createDataset(
-					.map(t -> t._2().toString())
+					docs.map(s -> new SerializableSolrInputDocument(s)).rdd(),
-					.map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s))
+					Encoders.kryo(SerializableSolrInputDocument.class))
-					.map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s))
+				.write()
-					.rdd();
+				.mode(SaveMode.Overwrite)
-
+				.parquet(outputPath);
-				final String collection = format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
+		} else {
-				SolrSupport.indexDocs(zkHost, collection, batchSize, docs);
+			final String collection = ProvisionConstants.getCollectionName(format);
-			});
+			SolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd());
 		}
 	}
 	protected static String toIndexRecord(Transformer tr, final String record) {
@ -151,56 +187,4 @@ public class XmlIndexingJob extends SolrApplication {
 		return new SimpleDateFormat(DATE_FORMAT).format(new Date());
 	}
 	/**
 	 * Method retrieves from the information system the list of fields associated to the given MDFormat name
 	 *
 	 * @param isLookup the ISLookup service stub
 	 * @param format the Metadata format name
 	 * @return the string representation of the list of fields to be indexed
 	 * @throws ISLookUpDocumentNotFoundException
 	 * @throws ISLookUpException
 	 */
 	private static String getLayoutSource(final ISLookUpService isLookup, final String format)
 		throws ISLookUpDocumentNotFoundException, ISLookUpException {
 		return doLookup(
 			isLookup,
 			String
 				.format(
 					"collection('')//RESOURCE_PROFILE[.//RESOURCE_TYPE/@value = 'MDFormatDSResourceType' and .//NAME='%s']//LAYOUT[@name='%s']",
 					format, LAYOUT));
 	}
 	/**
 	 * Method retrieves from the information system the openaireLayoutToRecordStylesheet
 	 *
 	 * @param isLookup the ISLookup service stub
 	 * @return the string representation of the XSLT contained in the transformation rule profile
 	 * @throws ISLookUpDocumentNotFoundException
 	 * @throws ISLookUpException
 	 */
 	private static String getLayoutTransformer(ISLookUpService isLookup) throws ISLookUpException {
 		return doLookup(
 			isLookup,
 			"collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType')"
 				+ "//RESOURCE_PROFILE[./BODY/CONFIGURATION/SCRIPT/TITLE/text() = 'openaireLayoutToRecordStylesheet']//CODE/node()");
 	}
 	/**
 	 * Method retrieves from the information system the IndexDS profile ID associated to the given MDFormat name
 	 *
 	 * @param format
 	 * @param isLookup
 	 * @return the IndexDS identifier
 	 * @throws ISLookUpException
 	 */
 	private static String getDsId(String format, ISLookUpService isLookup) throws ISLookUpException {
 		return doLookup(
 			isLookup,
 			String
 				.format(
 					"collection('/db/DRIVER/IndexDSResources/IndexDSResourceType')"
 						+ "//RESOURCE_PROFILE[./BODY/CONFIGURATION/METADATA_FORMAT/text() = '%s']//RESOURCE_IDENTIFIER/@value/string()",
 					format));
 	}
 }
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SerializableSolrInputDocument.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SerializableSolrInputDocument.java
@ -0,0 +1,22 @@
 package eu.dnetlib.dhp.oa.provision.model;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.SolrInputField;
 import java.util.HashMap;
 import java.util.Map;
 /**
 * Thin {@link SolrInputDocument} subclass introduced to make the documents compatible with the
 * Kryo serialization mechanism.
 */
 public class SerializableSolrInputDocument extends SolrInputDocument {
    /** Creates an empty document backed by a new {@link HashMap}. */
    public SerializableSolrInputDocument() {
        this(new HashMap<>());
    }
    /**
     * Creates a document from the given fields.
     *
     * @param fields the field map handed over to the {@link SolrInputDocument} constructor
     */
    public SerializableSolrInputDocument(Map<String, SolrInputField> fields) {
        super(fields);
    }
 }
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ISLookupClient.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ISLookupClient.java
@ -0,0 +1,95 @@
 package eu.dnetlib.dhp.oa.provision.utils;
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import eu.dnetlib.dhp.oa.provision.ProvisionConstants;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpDocumentNotFoundException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 /**
  * Client wrapping an {@link ISLookUpService} stub and exposing the xqueries needed by the
  * provision workflow.
  */
 public class ISLookupClient {
 	private static final Logger log = LoggerFactory.getLogger(ISLookupClient.class);
 	private ISLookUpService isLookup;
 	/**
 	 * @param isLookup the ISLookup service stub the queries are delegated to
 	 */
 	public ISLookupClient(ISLookUpService isLookup) {
 		this.isLookup = isLookup;
 	}
 	/**
 	 * Method retrieves from the information system the list of fields associated to the given MDFormat name
 	 *
 	 * @param format the Metadata format name
 	 * @return the string representation of the list of fields to be indexed
 	 * @throws ISLookUpDocumentNotFoundException
 	 * @throws ISLookUpException
 	 */
 	public String getLayoutSource(final String format)
 		throws ISLookUpDocumentNotFoundException, ISLookUpException {
 		return doLookup(
 			String
 				.format(
 					"collection('')//RESOURCE_PROFILE[.//RESOURCE_TYPE/@value = 'MDFormatDSResourceType' and .//NAME='%s']//LAYOUT[@name='%s']",
 					format, ProvisionConstants.LAYOUT));
 	}
 	/**
 	 * Method retrieves from the information system the openaireLayoutToRecordStylesheet
 	 *
 	 * @return the string representation of the XSLT contained in the transformation rule profile
 	 * @throws ISLookUpException
 	 */
 	public String getLayoutTransformer() throws ISLookUpException {
 		return doLookup(
 			"collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType')"
 				+ "//RESOURCE_PROFILE[./BODY/CONFIGURATION/SCRIPT/TITLE/text() = 'openaireLayoutToRecordStylesheet']//CODE/node()");
 	}
 	/**
 	 * Method retrieves from the information system the IndexDS profile ID associated to the given MDFormat name
 	 *
 	 * @param format the Metadata format name
 	 * @return the IndexDS identifier
 	 * @throws ISLookUpException
 	 */
 	public String getDsId(String format) throws ISLookUpException {
 		return doLookup(
 			String
 				.format(
 					"collection('/db/DRIVER/IndexDSResources/IndexDSResourceType')"
 						+ "//RESOURCE_PROFILE[./BODY/CONFIGURATION/METADATA_FORMAT/text() = '%s']//RESOURCE_IDENTIFIER/@value/string()",
 					format));
 	}
 	/**
 	 * Method retrieves from the information system the zookeeper quorum of the Solr server
 	 *
 	 * @return the zookeeper quorum of the Solr server
 	 * @throws ISLookUpException
 	 */
 	public String getZkHost() throws ISLookUpException {
 		return doLookup(
 			"for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='IndexServiceResourceType'] return $x//PROTOCOL[./@name='solr']/@address/string()");
 	}
 	/**
 	 * Runs the given xquery through the wrapped service, logging the query and the first 100
 	 * characters of the response.
 	 *
 	 * @param xquery the query to execute
 	 * @return the response returned by the lookup service, unmodified
 	 * @throws ISLookUpException
 	 */
 	private String doLookup(String xquery) throws ISLookUpException {
 		log.info("running xquery: {}", xquery);
 		final String res = getIsLookup().getResourceProfileByQuery(xquery);
 		// only a prefix is logged: the response may be a whole profile
 		log.info("got response (100 chars): {} ...", StringUtils.left(res, 100));
 		return res;
 	}
 	public ISLookUpService getIsLookup() {
 		return isLookup;
 	}
 	public void setIsLookup(ISLookUpService isLookup) {
 		this.isLookup = isLookup;
 	}
 }
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/StreamingInputDocumentFactory.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/StreamingInputDocumentFactory.java
@ -46,11 +46,6 @@ public class StreamingInputDocumentFactory {
 	private static final String INDEX_RECORD_ID = INDEX_FIELD_PREFIX + "indexrecordidentifier";
 	private static final String outFormat = "yyyy-MM-dd'T'hh:mm:ss'Z'";
 	private static final List<String> dateFormats = Arrays
 		.asList("yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "dd-MM-yyyy", "dd/MM/yyyy", "yyyy");
 	private static final String DEFAULTDNETRESULT = "dnetResult";
 	private static final String TARGETFIELDS = "targetFields";
@ -125,13 +120,12 @@ public class StreamingInputDocumentFactory {
 			}
 			if (!indexDocument.containsKey(INDEX_RECORD_ID)) {
-				indexDocument.clear();
+				throw new IllegalStateException("cannot extract record ID from: " + inputDocument);
 				System.err.println("missing indexrecord id:\n" + inputDocument);
 			}
 			return indexDocument;
 		} catch (XMLStreamException e) {
-			return new SolrInputDocument();
+			throw new IllegalStateException(e);
 		}
 	}
--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_update_index.json
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_params_update_index.json
@ -22,5 +22,11 @@
    "paramLongName": "batchSize",
    "paramDescription": "size of the batch of documents sent to solr",
    "paramRequired": false
  },
  {
    "paramName": "o",
    "paramLongName": "outputPath",
    "paramDescription": "path on hdfs activating an alternative output for the SolrInputDocuments",
    "paramRequired": false
  }
 ]
--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
@ -638,6 +638,7 @@
            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
            <arg>--format</arg><arg>${format}</arg>
            <arg>--batchSize</arg><arg>${batchSize}</arg>
            <arg>--outputPath</arg><arg>${outputPath}</arg>
        </spark>
        <ok to="commit_solr_collection"/>
        <error to="Kill"/>
--- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplicationTest.java
+++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplicationTest.java
@ -1,107 +1,18 @@
 package eu.dnetlib.dhp.oa.provision;
 import java.io.File;
 import java.nio.file.Path;
 import org.apache.solr.client.solrj.SolrResponse;
 import org.apache.solr.client.solrj.embedded.JettyConfig;
 import org.apache.solr.client.solrj.impl.CloudSolrClient;
 import org.apache.solr.client.solrj.impl.XMLResponseParser;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
 import org.apache.solr.client.solrj.request.ConfigSetAdminRequest;
 import org.apache.solr.client.solrj.request.QueryRequest;
 import org.apache.solr.client.solrj.request.RequestWriter;
 import org.apache.solr.client.solrj.response.CollectionAdminResponse;
 import org.apache.solr.client.solrj.response.ConfigSetAdminResponse;
 import org.apache.solr.client.solrj.response.SolrPingResponse;
 import org.apache.solr.client.solrj.response.UpdateResponse;
 import org.apache.solr.cloud.MiniSolrCloudCluster;
 import org.apache.solr.common.params.CollectionParams;
 import org.apache.solr.common.params.CoreAdminParams;
 import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.common.util.NamedList;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.io.TempDir;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import junit.framework.Assert;
-public class SolrAdminApplicationTest {
+public class SolrAdminApplicationTest extends SolrTest {
 	private static final Logger log = LoggerFactory.getLogger(SolrAdminApplicationTest.class);
 	public static final String DEFAULT_COLLECTION = "testCollection";
 	public static final String CONFIG_NAME = "testConfig";
 	private static MiniSolrCloudCluster miniCluster;
 	private static CloudSolrClient cloudSolrClient;
 	@TempDir
 	public static Path tempDir;
 	@BeforeAll
 	public static void setup() throws Exception {
 		// random unassigned HTTP port
 		final int jettyPort = 0;
 		final JettyConfig jettyConfig = JettyConfig.builder().setPort(jettyPort).build();
 		// create a MiniSolrCloudCluster instance
 		miniCluster = new MiniSolrCloudCluster(2, tempDir, jettyConfig);
 		// Upload Solr configuration directory to ZooKeeper
 		String solrZKConfigDir = "src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig";
 		File configDir = new File(solrZKConfigDir);
 		miniCluster.uploadConfigSet(configDir.toPath(), CONFIG_NAME);
 		// override settings in the solrconfig include
 		System.setProperty("solr.tests.maxBufferedDocs", "100000");
 		System.setProperty("solr.tests.maxIndexingThreads", "-1");
 		System.setProperty("solr.tests.ramBufferSizeMB", "100");
 		// use non-test classes so RandomizedRunner isn't necessary
 		System.setProperty("solr.tests.mergeScheduler", "org.apache.lucene.index.ConcurrentMergeScheduler");
 		System.setProperty("solr.directoryFactory", "solr.RAMDirectoryFactory");
 		cloudSolrClient = miniCluster.getSolrClient();
 		cloudSolrClient.setRequestWriter(new RequestWriter());
 		cloudSolrClient.setParser(new XMLResponseParser());
 		cloudSolrClient.setDefaultCollection(DEFAULT_COLLECTION);
 		cloudSolrClient.connect();
 		log.info(new ConfigSetAdminRequest.List().process(cloudSolrClient).toString());
 		log.info(CollectionAdminRequest.ClusterStatus.getClusterStatus().process(cloudSolrClient).toString());
 		createCollection(cloudSolrClient, DEFAULT_COLLECTION, 2, 1, CONFIG_NAME);
 	}
 	@AfterAll
 	public static void shutDown() throws Exception {
 		miniCluster.shutdown();
 	}
 	protected static NamedList<Object> createCollection(CloudSolrClient client, String name, int numShards,
 		int replicationFactor, String configName) throws Exception {
 		ModifiableSolrParams modParams = new ModifiableSolrParams();
 		modParams.set(CoreAdminParams.ACTION, CollectionParams.CollectionAction.CREATE.name());
 		modParams.set("name", name);
 		modParams.set("numShards", numShards);
 		modParams.set("replicationFactor", replicationFactor);
 		modParams.set("collection.configName", configName);
 		QueryRequest request = new QueryRequest(modParams);
 		request.setPath("/admin/collections");
 		return client.request(request);
 	}
 	@Test
 	public void testPing() throws Exception {
-		SolrPingResponse pingResponse = cloudSolrClient.ping();
+		SolrPingResponse pingResponse = miniCluster.getSolrClient().ping();
 		log.info("pingResponse: '{}'", pingResponse.getStatus());
 		Assert.assertTrue(pingResponse.getStatus() == 0);
 	}
--- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrTest.java
+++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrTest.java
@ -0,0 +1,109 @@
 package eu.dnetlib.dhp.oa.provision;
 import java.io.File;
 import java.nio.file.Path;
 import org.apache.commons.io.FileUtils;
 import org.apache.solr.client.solrj.embedded.JettyConfig;
 import org.apache.solr.client.solrj.impl.CloudSolrClient;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
 import org.apache.solr.client.solrj.request.ConfigSetAdminRequest;
 import org.apache.solr.client.solrj.request.QueryRequest;
 import org.apache.solr.cloud.MiniSolrCloudCluster;
 import org.apache.solr.common.params.CollectionParams;
 import org.apache.solr.common.params.CoreAdminParams;
 import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.common.util.NamedList;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.io.TempDir;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 /**
  * Base class for the tests needing a Solr instance: starts a two-node {@link MiniSolrCloudCluster}
  * before the tests of the class, creates the default collection and shuts everything down afterwards.
  */
 public abstract class SolrTest {
 	protected static final Logger log = LoggerFactory.getLogger(SolrTest.class);
 	protected static final String FORMAT = "test";
 	protected static final String DEFAULT_COLLECTION = FORMAT + "-index-openaire";
 	protected static final String CONFIG_NAME = "testConfig";
 	protected static MiniSolrCloudCluster miniCluster;
 	@TempDir
 	public static Path workingDir;
 	/**
 	 * Starts the cluster, uploads the test config set and creates {@link #DEFAULT_COLLECTION}.
 	 *
 	 * @throws Exception when the cluster cannot be started or configured
 	 */
 	@BeforeAll
 	public static void setup() throws Exception {
 		// random unassigned HTTP port
 		final int jettyPort = 0;
 		final JettyConfig jettyConfig = JettyConfig.builder().setPort(jettyPort).build();
 		log.info("working directory: {}", workingDir);
 		System.setProperty("solr.log.dir", workingDir.resolve("logs").toString());
 		// create a MiniSolrCloudCluster instance
 		miniCluster = new MiniSolrCloudCluster(2, workingDir.resolve("solr"), jettyConfig);
 		// Upload Solr configuration directory to ZooKeeper
 		String solrZKConfigDir = "src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig";
 		File configDir = new File(solrZKConfigDir);
 		miniCluster.uploadConfigSet(configDir.toPath(), CONFIG_NAME);
 		// override settings in the solrconfig include
 		System.setProperty("solr.tests.maxBufferedDocs", "100000");
 		System.setProperty("solr.tests.maxIndexingThreads", "-1");
 		System.setProperty("solr.tests.ramBufferSizeMB", "100");
 		// use non-test classes so RandomizedRunner isn't necessary
 		System.setProperty("solr.tests.mergeScheduler", "org.apache.lucene.index.ConcurrentMergeScheduler");
 		System.setProperty("solr.directoryFactory", "solr.RAMDirectoryFactory");
 		System.setProperty("solr.lock.type", "single");
 		log.info(new ConfigSetAdminRequest.List().process(miniCluster.getSolrClient()).toString());
 		logClusterStatus();
 		NamedList<Object> res = createCollection(
 			miniCluster.getSolrClient(), DEFAULT_COLLECTION, 4, 2, 20, CONFIG_NAME);
 		res.forEach(o -> log.info(o.toString()));
 		miniCluster.getSolrClient().setDefaultCollection(DEFAULT_COLLECTION);
 		logClusterStatus();
 	}
 	/**
 	 * Stops the cluster (when it was started) and removes the working directory.
 	 *
 	 * @throws Exception when the shutdown or the cleanup fails
 	 */
 	@AfterAll
 	public static void shutDown() throws Exception {
 		try {
 			// setup() may have failed before the cluster was created
 			if (miniCluster != null) {
 				miniCluster.shutdown();
 			}
 		} finally {
 			FileUtils.deleteDirectory(workingDir.toFile());
 		}
 	}
 	/**
 	 * Sends a collection CREATE request to the collections admin endpoint.
 	 *
 	 * @param client the client used to send the request
 	 * @param name the collection name
 	 * @param numShards value of the numShards request parameter
 	 * @param replicationFactor value of the replicationFactor request parameter
 	 * @param maxShardsPerNode value of the maxShardsPerNode request parameter
 	 * @param configName name of the config set the collection is based on
 	 * @return the response of the admin request
 	 * @throws Exception when the request fails
 	 */
 	protected static NamedList<Object> createCollection(CloudSolrClient client, String name, int numShards,
 		int replicationFactor, int maxShardsPerNode, String configName) throws Exception {
 		ModifiableSolrParams modParams = new ModifiableSolrParams();
 		modParams.set(CoreAdminParams.ACTION, CollectionParams.CollectionAction.CREATE.name());
 		modParams.set("name", name);
 		modParams.set("numShards", numShards);
 		modParams.set("replicationFactor", replicationFactor);
 		modParams.set("collection.configName", configName);
 		modParams.set("maxShardsPerNode", maxShardsPerNode);
 		QueryRequest request = new QueryRequest(modParams);
 		request.setPath("/admin/collections");
 		return client.request(request);
 	}
 	/** Logs the cluster status as reported by the collections admin API. */
 	private static void logClusterStatus() throws Exception {
 		log
 			.info(
 				CollectionAdminRequest.ClusterStatus
 					.getClusterStatus()
 					.process(miniCluster.getSolrClient())
 					.toString());
 	}
 }
--- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJobTest.java
+++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJobTest.java
@ -0,0 +1,140 @@
 package eu.dnetlib.dhp.oa.provision;
 import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
 import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.common.SolrInputField;
 import org.apache.solr.common.params.CommonParams;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
 import org.dom4j.io.SAXReader;
 import org.junit.jupiter.api.*;
 import org.junit.jupiter.api.extension.ExtendWith;
 import org.mockito.Mock;
 import org.mockito.Mockito;
 import org.mockito.junit.jupiter.MockitoExtension;
 import java.io.IOException;
 import java.io.StringReader;
 import java.net.URI;
/**
  * Tests for {@link XmlIndexingJob}, run against the Solr mini cluster provided by {@link SolrTest}
  * and a local Spark session. The IS lookup client is mocked, so the index layout and the
  * transformation stylesheet are served from test resources.
  */
 @ExtendWith(MockitoExtension.class)
 public class XmlIndexingJobTest extends SolrTest {

 	/** Sequence file(s) holding the XML records used as input by both tests. */
 	private static final String INPUT_PATH = "src/test/resources/eu/dnetlib/dhp/oa/provision/xml";

 	protected static SparkSession spark;

 	private static final Integer batchSize = 100;

 	@Mock
 	private ISLookUpService isLookUpService;

 	@Mock
 	private ISLookupClient isLookupClient;

 	/**
 	 * Stubs the mocked lookup client before each test: datasource id, ZooKeeper host of the
 	 * mini cluster, layout source and layout transformer.
 	 *
 	 * @throws ISLookUpException declared by the stubbed lookup methods
 	 * @throws IOException if a test resource cannot be read
 	 */
 	@BeforeEach
 	public void prepareMocks() throws ISLookUpException, IOException {
 		// isLookupClient is itself a Mockito mock, so this setter call has no effect on it
 		isLookupClient.setIsLookup(isLookUpService);

 		// the port is parsed from the ZooKeeper server address exposed by the mini cluster
 		final int zkPort = URI.create("http://" + miniCluster.getZkClient().getZkServerAddress()).getPort();

 		Mockito
 			.when(isLookupClient.getDsId(Mockito.anyString()))
 			.thenReturn("313f0381-23b6-466f-a0b8-c72a9679ac4b_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl");
 		Mockito.when(isLookupClient.getZkHost()).thenReturn(String.format("127.0.0.1:%s/solr", zkPort));
 		Mockito
 			.when(isLookupClient.getLayoutSource(Mockito.anyString()))
 			.thenReturn(readResource("fields.xml"));
 		Mockito
 			.when(isLookupClient.getLayoutTransformer())
 			.thenReturn(readResource("layoutToRecordTransformer.xsl"));
 	}

 	/**
 	 * Reads a classpath resource, resolved relative to this class, into a string.
 	 * The content is decoded as UTF-8 instead of the platform default charset, and the
 	 * underlying stream is opened and closed by {@code IOUtils.toString(URL, String)}.
 	 *
 	 * @param name the resource name, relative to the package of this class
 	 * @return the resource content
 	 * @throws IOException if the resource cannot be read
 	 */
 	private String readResource(final String name) throws IOException {
 		return IOUtils.toString(getClass().getResource(name), "UTF-8");
 	}

 	/**
 	 * Creates the local Spark session shared by the tests in this class.
 	 */
 	@BeforeAll
 	public static void before() {
 		SparkConf conf = new SparkConf();
 		conf.setAppName(XmlIndexingJobTest.class.getSimpleName());
 		conf.registerKryoClasses(new Class[] { SerializableSolrInputDocument.class });
 		conf.setMaster("local[1]");
 		conf.set("spark.driver.host", "localhost");
 		conf.set("hive.metastore.local", "true");
 		conf.set("spark.ui.enabled", "false");
 		conf.set("spark.sql.warehouse.dir", workingDir.resolve("spark").toString());

 		spark = SparkSession
 			.builder()
 			.appName(XmlIndexingJobTest.class.getSimpleName())
 			.config(conf)
 			.getOrCreate();
 	}

 	/**
 	 * Stops the Spark session created in {@link #before()}.
 	 */
 	@AfterAll
 	public static void tearDown() {
 		spark.stop();
 	}

 	/**
 	 * Runs the job without an output path and checks that the number of documents found in
 	 * Solr matches the number of input records.
 	 */
 	@Test
 	public void testXmlIndexingJob_onSolr() throws Exception {
 		long nRecord = JavaSparkContext
 			.fromSparkContext(spark.sparkContext())
 			.sequenceFile(INPUT_PATH, Text.class, Text.class)
 			.count();

 		new XmlIndexingJob(spark, INPUT_PATH, FORMAT, batchSize, null).run(isLookupClient);

 		Assertions.assertEquals(0, miniCluster.getSolrClient().commit().getStatus());

 		QueryResponse rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "*:*"));

 		Assertions.assertEquals(nRecord, rsp.getResults().getNumFound(),
 				"the number of indexed records should be equal to the number of input records");
 	}

 	/**
 	 * Runs the job with an output path and checks that the stored documents carry as many
 	 * distinct identifiers as the input records do.
 	 */
 	@Test
 	public void testXmlIndexingJob_saveOnHDFS() throws Exception {
 		final String ID_XPATH = "//header/*[local-name()='objIdentifier']";

 		final JavaPairRDD<Text, Text> xmlRecords = JavaSparkContext
 				.fromSparkContext(spark.sparkContext())
 				.sequenceFile(INPUT_PATH, Text.class, Text.class);
 		long nRecord = xmlRecords.count();

 		// number of distinct record identifiers found in the input XML
 		long xmlIdUnique = xmlRecords
 				.map(t -> t._2().toString())
 				.map(s -> new SAXReader().read(new StringReader(s)).valueOf(ID_XPATH))
 				.distinct().count();
 		Assertions.assertEquals(nRecord, xmlIdUnique, "IDs should be unique among input records");

 		final String outputPath = workingDir.resolve("outputPath").toAbsolutePath().toString();
 		new XmlIndexingJob(spark, INPUT_PATH, FORMAT, batchSize, outputPath).run(isLookupClient);

 		final Dataset<SerializableSolrInputDocument> solrDocs = spark.read()
 				.load(outputPath)
 				.as(Encoders.kryo(SerializableSolrInputDocument.class));

 		// number of distinct identifiers among the documents written to the output path
 		long docIdUnique = solrDocs.map((MapFunction<SerializableSolrInputDocument, String>) doc -> {
 			final SolrInputField id = doc.getField("__indexrecordidentifier");
 			return id.getFirstValue().toString();
 				}, Encoders.STRING())
 				.distinct()
 				.count();
 		Assertions.assertEquals(xmlIdUnique, docIdUnique, "IDs should be unique among the output records");
 	}
 }
--- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml
@ -105,7 +105,7 @@
        <FIELD indexable="true" name="relorganizationname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalname)"/>
        <FIELD indexable="true" name="relorganizationshortname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalshortname)"/>
        <FIELD indexable="true" name="relresultid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to[@type='result'])"/>
-        <FIELD indexable="true" name="relresulttype" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/resulttype/@classid)"/>
+        <FIELD indexable="true" name="relresulttype" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to/@type)"/>
        <FIELD indexable="true" name="relclass" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to/@class)"/>
        <FIELD indexable="true" name="relfundinglevel0_id" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']//rel/funding/funding_level_0"/>
        <FIELD indexable="true" name="relfundinglevel0_name" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']//rel/funding/funding_level_0/@name/string()"/>
@ -123,7 +123,8 @@
        <FIELD indexable="true" name="relfundername" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']//rel/funding/funder/@name)"/>
        <FIELD indexable="true" name="relfunderjurisdiction" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']//rel/funding/funder/@jurisdiction)"/><!-- Collected from of the related entity. Available for result-result relationships -->
        <FIELD indexable="true" name="relcollectedfromid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/collectedfrom/@id)"/>
-        <FIELD indexable="true" name="relcollectedfromname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/collectedfrom/@name)"/><!-- COMMON FIELDS -->
+        <FIELD indexable="true" name="relcollectedfromname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/collectedfrom/@name)"/>
        <FIELD indexable="true" name="semrelid" result="false" stat="false" tokenizable="false" value="concat(./to/text(), '||', ./to/@class/string())" xpath="//*[local-name()='entity']//rel"/><!-- COMMON FIELDS -->
        <FIELD indexable="true" multivalued="false" name="dateofcollection" result="false" stat="false" type="pdate" value="//header/*[local-name()='dateOfCollection']"/>
        <FIELD indexable="true" name="collectedfrom" result="false" stat="false" tokenizable="false" value="distinct-values(concat(./@id, '||', ./@name))" xpath="//*[local-name()='entity']/*/*[local-name()='collectedfrom'] | //*[local-name()='entity']/*//*[local-name() = 'instance']/*[local-name()='collectedfrom']"/>
        <FIELD indexable="true" name="collectedfromdatasourceid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/*[local-name()='collectedfrom']/@id | //*[local-name()='entity']/*//*[local-name() = 'instance']/*[local-name()='collectedfrom']/@id)"/>
--- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig/elevate.xml
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig/elevate.xml
@ -0,0 +1,31 @@
 Unless required by applicable law or agreed to in writing, software
        distributed under the License is distributed on an "AS IS" BASIS,
        WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
        See the License for the specific language governing permissions and
        limitations under the License.
        -->
        <!-- If this file is found in the config directory, it will only be
             loaded once at startup.  If it is found in Solr's data
             directory, it will be re-loaded every commit.
           See http://wiki.apache.org/solr/QueryElevationComponent for more info
        -->
 <elevate>
    <!-- Query elevation examples
     <query text="foo bar">
       <doc id="1" />
       <doc id="2" />
       <doc id="3" />
     </query>
   for use with techproducts example
     <query text="ipod">
       <doc id="MA147LL/A" />  put the actual ipod at the top
       <doc id="IW-02" exclude="true" /> exclude this cable
     </query>
   -->
 </elevate>
--- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig/managed-schema
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig/managed-schema
--- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig/solrconfig.xml
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/testConfig/solrconfig.xml
@ -83,6 +83,7 @@
  <lib dir="${solr.install.dir:../../../..}/contrib/velocity/lib" regex=".*\.jar" />
  <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-velocity-\d.*\.jar" />
  <!-- an exact 'path' can be used instead of a 'dir' to specify a
       specific jar file.  This will cause a serious error to be logged
       if it can't be loaded.
@ -112,7 +113,8 @@
       One can force a particular implementation via solr.MMapDirectoryFactory,
       solr.NIOFSDirectoryFactory, or solr.SimpleFSDirectoryFactory.
-       solr.RAMDirectoryFactory is memory based and not persistent.
+       solr.RAMDirectoryFactory is memory based, not
       persistent, and doesn't work with replication.
    -->
  <directoryFactory name="DirectoryFactory"
                    class="${solr.directoryFactory:solr.NRTCachingDirectoryFactory}"/>
@ -204,7 +206,7 @@
         More details on the nuances of each LockFactory...
         http://wiki.apache.org/lucene-java/AvailableLockFactories
    -->
-    <lockType>${solr.lock.type:single}</lockType>
+    <lockType>${solr.lock.type:native}</lockType>
    <!-- Commit Deletion Policy
         Custom deletion policies can be specified here. The class must
@ -331,6 +333,29 @@
         postCommit - fired after every commit or optimize command
         postOptimize - fired after every optimize command
      -->
    <!-- The RunExecutableListener executes an external command from a
         hook such as postCommit or postOptimize.
         exe - the name of the executable to run
         dir - dir to use as the current working directory. (default=".")
         wait - the calling thread waits until the executable returns.
                (default="true")
         args - the arguments to pass to the program.  (default is none)
         env - environment variables to set.  (default is none)
      -->
    <!-- This example shows how RunExecutableListener could be used
         with the script based replication...
         http://wiki.apache.org/solr/CollectionDistribution
      -->
    <!--
       <listener event="postCommit" class="solr.RunExecutableListener">
         <str name="exe">solr/bin/snapshooter</str>
         <str name="dir">.</str>
         <bool name="wait">true</bool>
         <arr name="args"> <str>arg1</str> <str>arg2</str> </arr>
         <arr name="env"> <str>MYVAR=val1</str> </arr>
       </listener>
      -->
  </updateHandler>
@ -366,14 +391,22 @@
       Query section - these settings control query time things like caches
       ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
  <query>
    <!-- Max Boolean Clauses
         Maximum number of clauses in each BooleanQuery,  an exception
         is thrown if exceeded.
         ** WARNING **
         This option actually modifies a global Lucene property that
         will affect all SolrCores.  If multiple solrconfig.xml files
         disagree on this property, the value at any given moment will
         be based on the last SolrCore to be initialized.
    <!-- Maximum number of clauses in each BooleanQuery,  an exception
         is thrown if exceeded.  It is safe to increase or remove this setting,
         since it is purely an arbitrary limit to try and catch user errors where
         large boolean queries may not be the best implementation choice.
      -->
    <maxBooleanClauses>1024</maxBooleanClauses>
    <!-- Solr Internal Query Caches
         There are two implementations of cache available for Solr,
@ -575,8 +608,21 @@
       This section contains instructions for how the SolrDispatchFilter
       should behave when processing requests for this SolrCore.
       handleSelect is a legacy option that affects the behavior of requests
       such as /select?qt=XXX
       handleSelect="true" will cause the SolrDispatchFilter to process
       the request and dispatch the query to a handler specified by the
       "qt" param, assuming "/select" isn't already registered.
       handleSelect="false" will cause the SolrDispatchFilter to
       ignore "/select" requests, resulting in a 404 unless a handler
       is explicitly registered with the name "/select"
       handleSelect="true" is not recommended for new users, but is the default
       for backwards compatibility
    -->
-  <requestDispatcher>
+  <requestDispatcher handleSelect="false" >
    <!-- Request Parsing
         These settings indicate how Solr Requests may be parsed, and
@ -602,14 +648,15 @@
         plugins.
         *** WARNING ***
-         Before enabling remote streaming, you should make sure your
+         The settings below authorize Solr to fetch remote files. You
-         system has authentication enabled.
+         should make sure your system has some authentication before
         using enableRemoteStreaming="true"
    <requestParsers enableRemoteStreaming="false"
                    multipartUploadLimitInKB="-1"
                    formdataUploadLimitInKB="-1"
                    addHttpRequestToContext="false"/>
      -->
    <requestParsers enableRemoteStreaming="true"
                    multipartUploadLimitInKB="2048000"
                    formdataUploadLimitInKB="2048"
                    addHttpRequestToContext="false"/>
    <!-- HTTP Caching
@ -673,6 +720,14 @@
       Incoming queries will be dispatched to a specific handler by name
       based on the path specified in the request.
       Legacy behavior: If the request path uses "/select" but no Request
       Handler has that name, and if handleSelect="true" has been specified in
       the requestDispatcher, then the Request Handler is dispatched based on
       the qt parameter.  Handlers without a leading '/' are accessed this way
       like so: http://host/app/[core/]select?qt=name  If no qt is
       given, then the requestHandler that declares default="true" will be
       used or the one named "standard".
       If a Request Handler is declared with startup="lazy", then it will
       not be initialized until the first request that uses it.
@ -692,13 +747,9 @@
      -->
    <lst name="defaults">
      <str name="echoParams">explicit</str>
      <str name="q.op">AND</str>
      <int name="rows">10</int>
-      <!-- Default search field
+      <!-- <str name="df">text</str> -->
         <str name="df">text</str> 
        -->
      <!-- Change from JSON to XML format (the default prior to Solr 7.0)
         <str name="wt">xml</str> 
        -->
    </lst>
    <!-- In addition to defaults, "appends" params can be specified
         to identify values which should be appended to the list of
@ -781,10 +832,18 @@
  <initParams path="/update/**,/query,/select,/tvrh,/elevate,/spell,/browse">
    <lst name="defaults">
-      <str name="df">_text_</str>
+      <str name="df">__all</str>
    </lst>
  </initParams>
  <!-- This enabled schemaless mode
  <initParams path="/update/**">
    <lst name="defaults">
      <str name="update.chain">add-unknown-fields-to-the-schema</str>
    </lst>
  </initParams>
  -->
  <!-- Solr Cell Update Request Handler
       http://wiki.apache.org/solr/ExtractingRequestHandler
@ -796,10 +855,9 @@
    <lst name="defaults">
      <str name="lowernames">true</str>
      <str name="fmap.meta">ignored_</str>
-      <str name="fmap.content">_text_</str>
+      <str name="fmap.content">__all</str>
    </lst>
  </requestHandler>
  <!-- Search Components
       Search components are registered to SolrCore and used by
@ -861,7 +919,7 @@
    <!-- a spellchecker built from a field of the main index -->
    <lst name="spellchecker">
      <str name="name">default</str>
-      <str name="field">_text_</str>
+      <str name="field">__all</str>
      <str name="classname">solr.DirectSolrSpellChecker</str>
      <!-- the spellcheck distance measure used, the default is the internal levenshtein -->
      <str name="distanceMeasure">internal</str>
@ -986,6 +1044,7 @@
  <searchComponent name="elevator" class="solr.QueryElevationComponent" >
    <!-- pick a fieldType to analyze queries -->
    <str name="queryFieldType">string</str>
    <str name="config-file">elevate.xml</str>
  </searchComponent>
  <!-- A request handler for demonstrating the elevator component -->
@ -1116,81 +1175,70 @@
  <!-- Add unknown fields to the schema
-       Field type guessing update processors that will
+       An example field type guessing update processor that will
       attempt to parse string-typed field values as Booleans, Longs,
       Doubles, or Dates, and then add schema fields with the guessed
-       field types. Text content will be indexed as "text_general" as
+       field types.
       well as a copy to a plain string version in *_str.
-       These require that the schema is both managed and mutable, by
+       This requires that the schema is both managed and mutable, by
       declaring schemaFactory as ManagedIndexSchemaFactory, with
       mutable specified as true.
       See http://wiki.apache.org/solr/GuessingFieldTypes
    -->
-  <updateProcessor class="solr.UUIDUpdateProcessorFactory" name="uuid"/>
+  <updateRequestProcessorChain name="add-unknown-fields-to-the-schema">
-  <updateProcessor class="solr.RemoveBlankFieldUpdateProcessorFactory" name="remove-blank"/>
+    <!-- UUIDUpdateProcessorFactory will generate an id if none is present in the incoming document -->
-  <updateProcessor class="solr.FieldNameMutatingUpdateProcessorFactory" name="field-name-mutating">
+    <processor class="solr.UUIDUpdateProcessorFactory" />
-    <str name="pattern">[^\w-\.]</str>
+    <processor class="solr.RemoveBlankFieldUpdateProcessorFactory"/>
-    <str name="replacement">_</str>
+    <processor class="solr.FieldNameMutatingUpdateProcessorFactory">
-  </updateProcessor>
+      <str name="pattern">[^\w-\.]</str>
-  <updateProcessor class="solr.ParseBooleanFieldUpdateProcessorFactory" name="parse-boolean"/>
+      <str name="replacement">_</str>
-  <updateProcessor class="solr.ParseLongFieldUpdateProcessorFactory" name="parse-long"/>
+    </processor>
-  <updateProcessor class="solr.ParseDoubleFieldUpdateProcessorFactory" name="parse-double"/>
+    <processor class="solr.ParseBooleanFieldUpdateProcessorFactory"/>
-  <updateProcessor class="solr.ParseDateFieldUpdateProcessorFactory" name="parse-date">
+    <processor class="solr.ParseLongFieldUpdateProcessorFactory"/>
-    <arr name="format">
+    <processor class="solr.ParseDoubleFieldUpdateProcessorFactory"/>
-      <str>yyyy-MM-dd'T'HH:mm:ss.SSSZ</str>
+    <processor class="solr.ParseDateFieldUpdateProcessorFactory">
-      <str>yyyy-MM-dd'T'HH:mm:ss,SSSZ</str>
+      <arr name="format">
-      <str>yyyy-MM-dd'T'HH:mm:ss.SSS</str>
+        <str>yyyy-MM-dd'T'HH:mm:ss.SSSZ</str>
-      <str>yyyy-MM-dd'T'HH:mm:ss,SSS</str>
+        <str>yyyy-MM-dd'T'HH:mm:ss,SSSZ</str>
-      <str>yyyy-MM-dd'T'HH:mm:ssZ</str>
+        <str>yyyy-MM-dd'T'HH:mm:ss.SSS</str>
-      <str>yyyy-MM-dd'T'HH:mm:ss</str>
+        <str>yyyy-MM-dd'T'HH:mm:ss,SSS</str>
-      <str>yyyy-MM-dd'T'HH:mmZ</str>
+        <str>yyyy-MM-dd'T'HH:mm:ssZ</str>
-      <str>yyyy-MM-dd'T'HH:mm</str>
+        <str>yyyy-MM-dd'T'HH:mm:ss</str>
-      <str>yyyy-MM-dd HH:mm:ss.SSSZ</str>
+        <str>yyyy-MM-dd'T'HH:mmZ</str>
-      <str>yyyy-MM-dd HH:mm:ss,SSSZ</str>
+        <str>yyyy-MM-dd'T'HH:mm</str>
-      <str>yyyy-MM-dd HH:mm:ss.SSS</str>
+        <str>yyyy-MM-dd HH:mm:ss.SSSZ</str>
-      <str>yyyy-MM-dd HH:mm:ss,SSS</str>
+        <str>yyyy-MM-dd HH:mm:ss,SSSZ</str>
-      <str>yyyy-MM-dd HH:mm:ssZ</str>
+        <str>yyyy-MM-dd HH:mm:ss.SSS</str>
-      <str>yyyy-MM-dd HH:mm:ss</str>
+        <str>yyyy-MM-dd HH:mm:ss,SSS</str>
-      <str>yyyy-MM-dd HH:mmZ</str>
+        <str>yyyy-MM-dd HH:mm:ssZ</str>
-      <str>yyyy-MM-dd HH:mm</str>
+        <str>yyyy-MM-dd HH:mm:ss</str>
-      <str>yyyy-MM-dd</str>
+        <str>yyyy-MM-dd HH:mmZ</str>
-    </arr>
+        <str>yyyy-MM-dd HH:mm</str>
-  </updateProcessor>
+        <str>yyyy-MM-dd</str>
-  <updateProcessor class="solr.AddSchemaFieldsUpdateProcessorFactory" name="add-schema-fields">
+      </arr>
-    <lst name="typeMapping">
+    </processor>
-      <str name="valueClass">java.lang.String</str>
+    <processor class="solr.AddSchemaFieldsUpdateProcessorFactory">
-      <str name="fieldType">text_general</str>
+      <str name="defaultFieldType">strings</str>
-      <lst name="copyField">
+      <lst name="typeMapping">
-        <str name="dest">*_str</str>
+        <str name="valueClass">java.lang.Boolean</str>
-        <int name="maxChars">256</int>
+        <str name="fieldType">booleans</str>
      </lst>
-      <!-- Use as default mapping instead of defaultFieldType -->
+      <lst name="typeMapping">
-      <bool name="default">true</bool>
+        <str name="valueClass">java.util.Date</str>
-    </lst>
+        <str name="fieldType">tdates</str>
-    <lst name="typeMapping">
+      </lst>
-      <str name="valueClass">java.lang.Boolean</str>
+      <lst name="typeMapping">
-      <str name="fieldType">booleans</str>
+        <str name="valueClass">java.lang.Long</str>
-    </lst>
+        <str name="valueClass">java.lang.Integer</str>
-    <lst name="typeMapping">
+        <str name="fieldType">tlongs</str>
-      <str name="valueClass">java.util.Date</str>
+      </lst>
-      <str name="fieldType">pdates</str>
+      <lst name="typeMapping">
-    </lst>
+        <str name="valueClass">java.lang.Number</str>
-    <lst name="typeMapping">
+        <str name="fieldType">tdoubles</str>
-      <str name="valueClass">java.lang.Long</str>
+      </lst>
-      <str name="valueClass">java.lang.Integer</str>
+    </processor>
      <str name="fieldType">plongs</str>
    </lst>
    <lst name="typeMapping">
      <str name="valueClass">java.lang.Number</str>
      <str name="fieldType">pdoubles</str>
    </lst>
  </updateProcessor>
  <!-- The update.autoCreateFields property can be turned to false to disable schemaless mode -->
  <updateRequestProcessorChain name="add-unknown-fields-to-the-schema" default="${update.autoCreateFields:true}"
           processor="uuid,remove-blank,field-name-mutating,parse-boolean,parse-long,parse-double,parse-date,add-schema-fields">
    <processor class="solr.LogUpdateProcessorFactory"/>
    <processor class="solr.DistributedUpdateProcessorFactory"/>
    <processor class="solr.RunUpdateProcessorFactory"/>
@ -1313,7 +1361,7 @@
  <!-- Query Parsers
-       https://lucene.apache.org/solr/guide/query-syntax-and-parsing.html
+       https://cwiki.apache.org/confluence/display/solr/Query+Syntax+and+Parsing
       Multiple QParserPlugins can be registered by name, and then
       used in either the "defType" param for the QueryComponent (used
--- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/xml/part-00000
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/xml/part-00000