forked from D-Net/dnet-hadoop
Compare commits
7 Commits
master
...
scholix_to
Author | SHA1 | Date |
---|---|---|
Sandro La Bruzzo | ffa8cdf981 | |
Sandro La Bruzzo | 818a936468 | |
Sandro La Bruzzo | 4b8739e45b | |
Sandro La Bruzzo | 7784b3d9c4 | |
Sandro La Bruzzo | 6d5cda1a03 | |
Sandro La Bruzzo | bf6c8ccc79 | |
Sandro La Bruzzo | 56f880c89d |
|
@ -3,6 +3,8 @@ package eu.dnetlib.dhp.schema.oaf.utils;
|
|||
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
|
||||
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.time.LocalDate;
|
||||
import java.time.ZoneId;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
|
@ -36,6 +38,14 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
|
||||
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
|
||||
|
||||
public static final String BLANK = "";
|
||||
|
||||
private static final String normalizeOutFormat = "yyyy-MM-dd'T'hh:mm:ss'Z'";
|
||||
|
||||
private static final String[] normalizeDateFormats = {
|
||||
"yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy"
|
||||
};
|
||||
|
||||
public static <T extends Oaf> T fixVocabularyNames(T value) {
|
||||
if (value instanceof Datasource) {
|
||||
// nothing to clean here
|
||||
|
@ -459,6 +469,20 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
return Optional.ofNullable(cleanDate(date));
|
||||
}
|
||||
|
||||
public static String normalizeDate(String s) {
|
||||
final String date = StringUtils.isNotBlank(s) ? s.trim() : BLANK;
|
||||
|
||||
for (String format : normalizeDateFormats) {
|
||||
try {
|
||||
Date parse = new SimpleDateFormat(format).parse(date);
|
||||
String res = new SimpleDateFormat(normalizeOutFormat).format(parse);
|
||||
return res;
|
||||
} catch (ParseException e) {
|
||||
}
|
||||
}
|
||||
return BLANK;
|
||||
}
|
||||
|
||||
public static String cleanDate(final String inputDate) {
|
||||
|
||||
if (StringUtils.isBlank(inputDate)) {
|
||||
|
|
|
@ -7,6 +7,7 @@ import java.util.Date;
|
|||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
|
||||
import net.sf.saxon.expr.XPathContext;
|
||||
import net.sf.saxon.om.Sequence;
|
||||
import net.sf.saxon.trans.XPathException;
|
||||
|
@ -14,15 +15,6 @@ import net.sf.saxon.value.SequenceType;
|
|||
import net.sf.saxon.value.StringValue;
|
||||
|
||||
public class NormalizeDate extends AbstractExtensionFunction {
|
||||
|
||||
private static final String[] normalizeDateFormats = {
|
||||
"yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy"
|
||||
};
|
||||
|
||||
private static final String normalizeOutFormat = "yyyy-MM-dd'T'hh:mm:ss'Z'";
|
||||
|
||||
public static final String BLANK = "";
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return "normalizeDate";
|
||||
|
@ -31,10 +23,10 @@ public class NormalizeDate extends AbstractExtensionFunction {
|
|||
@Override
|
||||
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
|
||||
if (arguments == null || arguments.length == 0) {
|
||||
return new StringValue(BLANK);
|
||||
return new StringValue(GraphCleaningFunctions.BLANK);
|
||||
}
|
||||
String s = arguments[0].head().getStringValue();
|
||||
return new StringValue(_normalizeDate(s));
|
||||
return new StringValue(GraphCleaningFunctions.normalizeDate(s));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -58,18 +50,4 @@ public class NormalizeDate extends AbstractExtensionFunction {
|
|||
public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) {
|
||||
return SequenceType.SINGLE_STRING;
|
||||
}
|
||||
|
||||
private String _normalizeDate(String s) {
|
||||
final String date = StringUtils.isNotBlank(s) ? s.trim() : BLANK;
|
||||
|
||||
for (String format : normalizeDateFormats) {
|
||||
try {
|
||||
Date parse = new SimpleDateFormat(format).parse(date);
|
||||
String res = new SimpleDateFormat(normalizeOutFormat).format(parse);
|
||||
return res;
|
||||
} catch (ParseException e) {
|
||||
}
|
||||
}
|
||||
return BLANK;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -142,6 +142,21 @@ object DataciteToOAFTransformation {
|
|||
}
|
||||
}
|
||||
|
||||
/***
|
||||
* Use the vocabulary dnet:publication_resource to find a synonym to one of these terms and get the instance.type.
|
||||
* Using the dnet:result_typologies vocabulary, we look up the instance.type synonym
|
||||
* to generate one of the following main entities:
|
||||
* - publication
|
||||
* - dataset
|
||||
* - software
|
||||
* - otherresearchproduct
|
||||
|
||||
* @param resourceType
|
||||
* @param resourceTypeGeneral
|
||||
* @param schemaOrg
|
||||
* @param vocabularies
|
||||
* @return
|
||||
*/
|
||||
def getTypeQualifier(
|
||||
resourceType: String,
|
||||
resourceTypeGeneral: String,
|
||||
|
|
|
@ -45,6 +45,10 @@
|
|||
</build>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.antlr</groupId>
|
||||
<artifactId>stringtemplate</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
|
|
|
@ -23,7 +23,7 @@ public class SolrAdminApplication implements Closeable {
|
|||
private static final Logger log = LoggerFactory.getLogger(SolrAdminApplication.class);
|
||||
|
||||
enum Action {
|
||||
DELETE_BY_QUERY, COMMIT
|
||||
DELETE_BY_QUERY, COMMIT, CREATE
|
||||
}
|
||||
|
||||
private final CloudSolrClient solrClient;
|
||||
|
@ -56,6 +56,8 @@ public class SolrAdminApplication implements Closeable {
|
|||
|
||||
final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));
|
||||
|
||||
final String fields = isLookup.getLayoutSource(format);
|
||||
|
||||
final String zkHost = isLookup.getZkHost();
|
||||
log.info("zkHost: {}", zkHost);
|
||||
|
||||
|
@ -63,7 +65,7 @@ public class SolrAdminApplication implements Closeable {
|
|||
log.info("collection: {}", collection);
|
||||
|
||||
try (SolrAdminApplication app = new SolrAdminApplication(zkHost)) {
|
||||
app.execute(action, collection, query, commit);
|
||||
app.execute(action, collection, query, commit, fields);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -73,10 +75,10 @@ public class SolrAdminApplication implements Closeable {
|
|||
}
|
||||
|
||||
public SolrResponse commit(String collection) throws IOException, SolrServerException {
|
||||
return execute(Action.COMMIT, collection, null, true);
|
||||
return execute(Action.COMMIT, collection, null, true, null);
|
||||
}
|
||||
|
||||
public SolrResponse execute(Action action, String collection, String query, boolean commit)
|
||||
public SolrResponse execute(Action action, String collection, String query, boolean commit, final String fields)
|
||||
throws IOException, SolrServerException {
|
||||
switch (action) {
|
||||
|
||||
|
@ -88,6 +90,12 @@ public class SolrAdminApplication implements Closeable {
|
|||
return rsp;
|
||||
case COMMIT:
|
||||
return solrClient.commit(collection);
|
||||
case CREATE:
|
||||
SolrUtil
|
||||
.uploadZookeperConfig(this.solrClient.getZkStateReader().getZkClient(), collection, true, fields);
|
||||
SolrUtil.createCollection(this.solrClient, collection, 48, 1, 12, collection);
|
||||
return null;
|
||||
|
||||
default:
|
||||
throw new IllegalArgumentException("action not managed: " + action);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,245 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.provision;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
import javax.xml.transform.Transformer;
|
||||
import javax.xml.transform.TransformerFactory;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.solr.client.solrj.SolrServerException;
|
||||
import org.apache.solr.client.solrj.impl.CloudSolrClient;
|
||||
import org.apache.solr.client.solrj.request.QueryRequest;
|
||||
import org.apache.solr.common.cloud.SolrZkClient;
|
||||
import org.apache.solr.common.params.CollectionParams;
|
||||
import org.apache.solr.common.params.CoreAdminParams;
|
||||
import org.apache.solr.common.params.ModifiableSolrParams;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.zookeeper.CreateMode;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.io.DocumentResult;
|
||||
import org.dom4j.io.DocumentSource;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.stringtemplate.v4.ST;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.fasterxml.jackson.databind.type.MapType;
|
||||
import com.fasterxml.jackson.databind.type.TypeFactory;
|
||||
|
||||
public class SolrUtil {
|
||||
|
||||
/**
|
||||
* The log.
|
||||
*/
|
||||
private static final Logger log = LoggerFactory.getLogger(SolrUtil.class);
|
||||
|
||||
/**
|
||||
* The Constant CONFIGS_PATH.
|
||||
*/
|
||||
private static final String CONFIGS_PATH = "/configs";
|
||||
|
||||
private static final char DELIMITER = '$';
|
||||
|
||||
public static final String CONF_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/";
|
||||
|
||||
// public static final String CONF_FILE_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/files/";
|
||||
|
||||
public static final String LIST_FILE_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/files/file_list";
|
||||
|
||||
private static final String SCHEMA_TEMPLATE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/schemaTemplate.xslt";
|
||||
|
||||
private static String createURLRequest = "http://%s:%s/solr/admin/collections?action=CREATE&name=%s&numShards=%s&replicationFactor=%s&maxShardsPerNode=%s&collection.configName=%s";
|
||||
|
||||
private static String generateCreateIndexRequest(final String host,
|
||||
final String port,
|
||||
final String collectionName,
|
||||
final String numShard,
|
||||
final String replicationFactor,
|
||||
final String collectionConfigName,
|
||||
final String maxShardsPerNode) {
|
||||
return String
|
||||
.format(
|
||||
createURLRequest, host, port, collectionName, numShard, replicationFactor, maxShardsPerNode,
|
||||
collectionConfigName);
|
||||
}
|
||||
|
||||
public static boolean createSolrIndex(final String host,
|
||||
final String port,
|
||||
final String collectionName,
|
||||
final String numShard,
|
||||
final String replicationFactor,
|
||||
final String maxShardsPerNode,
|
||||
final String collectionConfigName) throws Exception {
|
||||
|
||||
final String uri = generateCreateIndexRequest(
|
||||
host, port, collectionName, numShard, replicationFactor, maxShardsPerNode, collectionConfigName);
|
||||
|
||||
URL url = new URL(uri);
|
||||
System.out.println(uri);
|
||||
|
||||
HttpURLConnection connection = (HttpURLConnection) url.openConnection();
|
||||
connection.setRequestMethod("GET");
|
||||
int status = connection.getResponseCode();
|
||||
System.out.println("status = " + status);
|
||||
|
||||
BufferedReader in = new BufferedReader(
|
||||
new InputStreamReader(connection.getInputStream()));
|
||||
String inputLine;
|
||||
StringBuffer content = new StringBuffer();
|
||||
while ((inputLine = in.readLine()) != null) {
|
||||
content.append(inputLine);
|
||||
}
|
||||
in.close();
|
||||
|
||||
log.debug("content = " + content);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public static void uploadZookeperConfig(final SolrZkClient zkClient,
|
||||
final String coreName,
|
||||
final boolean overwrite,
|
||||
final String layout) {
|
||||
|
||||
final String basepath = CONFIGS_PATH + "/" + coreName;
|
||||
|
||||
log.info("uploading solr configuration to ZK for index collection: " + coreName);
|
||||
try {
|
||||
if (overwrite && zkClient.getSolrZooKeeper().exists(basepath, false) != null) {
|
||||
log.info("cleanup ZK configuration: " + coreName);
|
||||
for (String child : zkClient.getSolrZooKeeper().getChildren(basepath, false)) {
|
||||
final String path = basepath + "/" + child;
|
||||
log.debug("cleanup ZK file: " + path);
|
||||
zkClient.delete(path, -1, true);
|
||||
}
|
||||
zkClient.delete(basepath, -1, true);
|
||||
}
|
||||
if (!zkClient.exists(basepath, true)) {
|
||||
log.info("upload ZK configuration: " + coreName);
|
||||
zkClient.makePath(basepath, true);
|
||||
uploadConfiguration(zkClient, basepath, buildConfiguration(layout));
|
||||
}
|
||||
log.info("upload ZK configuration complete");
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("unable to upload solr configuration", e);
|
||||
}
|
||||
}
|
||||
|
||||
private static void uploadConfiguration(final SolrZkClient zkClient, final String basePath,
|
||||
final Map<String, byte[]> resources) throws KeeperException,
|
||||
InterruptedException, IOException {
|
||||
|
||||
if (!zkClient.exists(basePath, true)) {
|
||||
zkClient.makePath(basePath, true);
|
||||
}
|
||||
|
||||
for (final Map.Entry<String, byte[]> e : resources.entrySet()) {
|
||||
String path = basePath + "/" + e.getKey();
|
||||
log.debug("upload ZK configuration: " + path);
|
||||
zkClient.create(path, e.getValue(), CreateMode.PERSISTENT, true);
|
||||
}
|
||||
}
|
||||
|
||||
private static String loadFileInClassPath(final String aPath) {
|
||||
System.out.println("LOAD FILE FROM PATH: " + aPath);
|
||||
try {
|
||||
return IOUtils
|
||||
.toString(Objects.requireNonNull(SolrUtil.class.getResourceAsStream(aPath)), Charset.defaultCharset());
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public static Map<String, String> getServiceProperties() throws IOException {
|
||||
final String properties = loadFileInClassPath(CONF_BASE_PATH + "service_properties.json");
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
TypeFactory typeFactory = mapper.getTypeFactory();
|
||||
MapType mapType = typeFactory.constructMapType(HashMap.class, String.class, String.class);
|
||||
return mapper.readValue(properties, mapType);
|
||||
}
|
||||
|
||||
public static String getConfig() throws Exception {
|
||||
final Map<String, String> p = getServiceProperties();
|
||||
final String st = loadFileInClassPath(CONF_BASE_PATH + "solrconfig.xml.st");
|
||||
final ST solrConfig = new ST(st, DELIMITER, DELIMITER);
|
||||
p.forEach(solrConfig::add);
|
||||
return solrConfig.render();
|
||||
}
|
||||
|
||||
public static NamedList<Object> createCollection(CloudSolrClient client, String name, int numShards,
|
||||
int replicationFactor, int maxShardsPerNode, String configName) throws SolrServerException, IOException {
|
||||
ModifiableSolrParams modParams = new ModifiableSolrParams();
|
||||
modParams.set(CoreAdminParams.ACTION, CollectionParams.CollectionAction.CREATE.name());
|
||||
modParams.set("name", name);
|
||||
modParams.set("numShards", numShards);
|
||||
modParams.set("replicationFactor", replicationFactor);
|
||||
modParams.set("collection.configName", configName);
|
||||
modParams.set("maxShardsPerNode", maxShardsPerNode);
|
||||
QueryRequest request = new QueryRequest(modParams);
|
||||
request.setPath("/admin/collections");
|
||||
return client.request(request);
|
||||
}
|
||||
|
||||
private static Map<String, byte[]> buildConfiguration(final String layout)
|
||||
throws Exception {
|
||||
|
||||
Map<String, byte[]> res = new HashMap<>();
|
||||
|
||||
try {
|
||||
log.debug("adding schema.xml to the resource map");
|
||||
res.put("schema.xml", getSchemaXML(layout).getBytes());
|
||||
|
||||
res.put("solrconfig.xml", getConfig().getBytes());
|
||||
log.debug("adding solrconfig.xml to the resource map");
|
||||
String data = IOUtils
|
||||
.toString(Objects.requireNonNull(SolrUtil.class.getResourceAsStream(LIST_FILE_BASE_PATH)));
|
||||
Arrays.stream(data.split("\n")).forEach(s -> {
|
||||
final String name = s.replace(CONF_BASE_PATH + "files/", "");
|
||||
res
|
||||
.put(
|
||||
name,
|
||||
Objects.requireNonNull(loadFileInClassPath(s)).getBytes(StandardCharsets.UTF_8));
|
||||
});
|
||||
return res;
|
||||
} catch (Throwable e) {
|
||||
throw new Exception("failed to build configuration", e);
|
||||
}
|
||||
}
|
||||
|
||||
public static String getSchemaXML(final String layout) throws Exception {
|
||||
|
||||
final Document fields = new SAXReader().read(new ByteArrayInputStream(layout.getBytes(StandardCharsets.UTF_8)));
|
||||
|
||||
Transformer transformer = TransformerFactory
|
||||
.newInstance()
|
||||
.newTransformer(
|
||||
new DocumentSource(new SAXReader().read(SolrUtil.class.getResourceAsStream(SCHEMA_TEMPLATE_PATH))));
|
||||
transformer.setParameter("textFieldType", "text_common");
|
||||
|
||||
final DocumentResult result = new DocumentResult();
|
||||
|
||||
transformer.transform(new DocumentSource(fields), result);
|
||||
String xml = result.getDocument().asXML();
|
||||
|
||||
log.debug("new index schema:\n" + xml);
|
||||
|
||||
return xml;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,121 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.provision.scholix;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.sx.scholix.*;
|
||||
|
||||
public class ScholixToSolr implements MapFunction<String, SolrInputDocument> {
|
||||
final static ObjectMapper MAPPER = new ObjectMapper();
|
||||
|
||||
public static SerializableSolrInputDocument toSolrDocument(final String json) {
|
||||
try {
|
||||
final Scholix input = MAPPER.readValue(json, Scholix.class);
|
||||
final SerializableSolrInputDocument output = new SerializableSolrInputDocument();
|
||||
|
||||
fillEntityField(output, input.getSource(), "source");
|
||||
fillEntityField(output, input.getTarget(), "target");
|
||||
final String cleanDate = GraphCleaningFunctions.cleanDate(input.getPublicationDate());
|
||||
|
||||
if (cleanDate != null)
|
||||
output.addField("publication_date", GraphCleaningFunctions.normalizeDate(cleanDate));
|
||||
|
||||
if (input.getRelationship() != null && input.getRelationship().getName() != null)
|
||||
output.addField("relation_name", input.getRelationship().getName());
|
||||
else
|
||||
return null;
|
||||
if (input.getRelationship() != null && input.getRelationship().getInverse() != null)
|
||||
output.addField("relation_inverse", input.getRelationship().getInverse());
|
||||
|
||||
if (input.getLinkprovider() != null) {
|
||||
final List<String> linkProviders = input
|
||||
.getLinkprovider()
|
||||
.stream()
|
||||
.map(ScholixEntityId::getName)
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
output.addField("link_provider", linkProviders);
|
||||
}
|
||||
if (input.getPublisher() != null) {
|
||||
final List<String> publishers = input
|
||||
.getPublisher()
|
||||
.stream()
|
||||
.map(ScholixEntityId::getName)
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList());
|
||||
output.addField("publisher_name", publishers);
|
||||
}
|
||||
|
||||
output.addField("__indexrecordidentifier", input.getIdentifier());
|
||||
output.addField("__result", json);
|
||||
return output;
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Error on convert Scholix");
|
||||
}
|
||||
}
|
||||
|
||||
private static void fillEntityField(final SerializableSolrInputDocument document, final ScholixResource resource,
|
||||
final String prefix) {
|
||||
|
||||
document.addField(prefix + "_identifier", resource.getDnetIdentifier());
|
||||
document.addField(prefix + "_type", resource.getObjectType());
|
||||
document.addField(prefix + "_publication_date", resource.getPublicationDate());
|
||||
document.addField(prefix + "_subtype", resource.getObjectSubType());
|
||||
|
||||
List<String> resourcePIDs = resource
|
||||
.getIdentifier()
|
||||
.stream()
|
||||
.map(ScholixIdentifier::getIdentifier)
|
||||
.collect(Collectors.toList());
|
||||
document.addField(prefix + "_pid", resourcePIDs);
|
||||
|
||||
List<String> resourceSchemas = resource
|
||||
.getIdentifier()
|
||||
.stream()
|
||||
.map(ScholixIdentifier::getSchema)
|
||||
.collect(Collectors.toList());
|
||||
document.addField(prefix + "_schema", resourceSchemas);
|
||||
|
||||
if (resource.getPublisher() != null) {
|
||||
|
||||
final List<String> publishers = resource
|
||||
.getPublisher()
|
||||
.stream()
|
||||
.map(ScholixEntityId::getName)
|
||||
.collect(Collectors.toList());
|
||||
if (publishers.size() > 0)
|
||||
document.addField(prefix + "_publisher", publishers);
|
||||
}
|
||||
|
||||
if (resource.getCollectedFrom() != null) {
|
||||
|
||||
final List<String> collectedFrom = resource
|
||||
.getCollectedFrom()
|
||||
.stream()
|
||||
.map(ScholixCollectedFrom::getProvider)
|
||||
.filter(Objects::nonNull)
|
||||
.map(ScholixEntityId::getName)
|
||||
.collect(Collectors.toList());
|
||||
if (collectedFrom.size() > 0)
|
||||
document.addField(prefix + "_collected_from", collectedFrom);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public SerializableSolrInputDocument call(String s) throws Exception {
|
||||
return toSolrDocument(s);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,102 @@
|
|||
|
||||
package eu.dnetlib.dhp.sx.provision;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.cli.ParseException;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoder;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.lucidworks.spark.util.SolrSupport;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.provision.ProvisionConstants;
|
||||
import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
|
||||
import eu.dnetlib.dhp.oa.provision.scholix.ScholixToSolr;
|
||||
import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
|
||||
public class SparkIndexCollectionOnSOLR {
|
||||
|
||||
private static final Integer DEFAULT_BATCH_SIZE = 1000;
|
||||
|
||||
// LOGGER initialized
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkIndexCollectionOnSOLR.class);
|
||||
|
||||
public static void main(String[] args) throws IOException, ParseException {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
Objects
|
||||
.requireNonNull(
|
||||
SparkIndexCollectionOnSOLR.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/sx/provision/index_solr_parameters.json"))));
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String cluster = parser.get("cluster");
|
||||
log.info("Cluster is {}", cluster);
|
||||
|
||||
final String format = parser.get("format");
|
||||
log.info("Index format name is {}", format);
|
||||
|
||||
final String isLookupUrl = parser.get("isURL");
|
||||
log.info("isURL is {}", isLookupUrl);
|
||||
|
||||
final String inputPath = parser.get("inputPath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final Integer batchSize = Optional
|
||||
.ofNullable(parser.get("batchSize"))
|
||||
.map(Integer::valueOf)
|
||||
.orElse(DEFAULT_BATCH_SIZE);
|
||||
log.info("batchSize: {}", batchSize);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
conf.registerKryoClasses(new Class[] {
|
||||
SerializableSolrInputDocument.class
|
||||
});
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));
|
||||
final String zkHost = isLookup.getZkHost();
|
||||
log.info("zkHost: {}", zkHost);
|
||||
final String collection = ProvisionConstants.getCollectionName(format);
|
||||
log.info("collection: {}", collection);
|
||||
feedScholixToSOLRIndex(spark, inputPath, collection, batchSize, zkHost);
|
||||
});
|
||||
}
|
||||
|
||||
public static void feedScholixToSOLRIndex(final SparkSession spark, final String inputPath, final String collection,
|
||||
Integer batchSize, final String zkHost) {
|
||||
final JavaRDD<SolrInputDocument> docs = spark
|
||||
.read()
|
||||
.text(inputPath)
|
||||
.as(Encoders.STRING())
|
||||
.map(new ScholixToSolr(), Encoders.kryo(SolrInputDocument.class))
|
||||
.toJavaRDD();
|
||||
SolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd());
|
||||
|
||||
}
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
<?xml version="1.0" ?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!-- Example exchange rates file for CurrencyField type named "currency" in example schema -->
|
||||
|
||||
<currencyConfig version="1.0">
|
||||
<rates>
|
||||
<!-- Updated from http://www.exchangerate.com/ at 2011-09-27 -->
|
||||
<rate from="USD" to="ARS" rate="4.333871" comment="ARGENTINA Peso" />
|
||||
<rate from="USD" to="AUD" rate="1.025768" comment="AUSTRALIA Dollar" />
|
||||
<rate from="USD" to="EUR" rate="0.743676" comment="European Euro" />
|
||||
<rate from="USD" to="BRL" rate="1.881093" comment="BRAZIL Real" />
|
||||
<rate from="USD" to="CAD" rate="1.030815" comment="CANADA Dollar" />
|
||||
<rate from="USD" to="CLP" rate="519.0996" comment="CHILE Peso" />
|
||||
<rate from="USD" to="CNY" rate="6.387310" comment="CHINA Yuan" />
|
||||
<rate from="USD" to="CZK" rate="18.47134" comment="CZECH REP. Koruna" />
|
||||
<rate from="USD" to="DKK" rate="5.515436" comment="DENMARK Krone" />
|
||||
<rate from="USD" to="HKD" rate="7.801922" comment="HONG KONG Dollar" />
|
||||
<rate from="USD" to="HUF" rate="215.6169" comment="HUNGARY Forint" />
|
||||
<rate from="USD" to="ISK" rate="118.1280" comment="ICELAND Krona" />
|
||||
<rate from="USD" to="INR" rate="49.49088" comment="INDIA Rupee" />
|
||||
<rate from="USD" to="XDR" rate="0.641358" comment="INTNL MON. FUND SDR" />
|
||||
<rate from="USD" to="ILS" rate="3.709739" comment="ISRAEL Sheqel" />
|
||||
<rate from="USD" to="JPY" rate="76.32419" comment="JAPAN Yen" />
|
||||
<rate from="USD" to="KRW" rate="1169.173" comment="KOREA (SOUTH) Won" />
|
||||
<rate from="USD" to="KWD" rate="0.275142" comment="KUWAIT Dinar" />
|
||||
<rate from="USD" to="MXN" rate="13.85895" comment="MEXICO Peso" />
|
||||
<rate from="USD" to="NZD" rate="1.285159" comment="NEW ZEALAND Dollar" />
|
||||
<rate from="USD" to="NOK" rate="5.859035" comment="NORWAY Krone" />
|
||||
<rate from="USD" to="PKR" rate="87.57007" comment="PAKISTAN Rupee" />
|
||||
<rate from="USD" to="PEN" rate="2.730683" comment="PERU Sol" />
|
||||
<rate from="USD" to="PHP" rate="43.62039" comment="PHILIPPINES Peso" />
|
||||
<rate from="USD" to="PLN" rate="3.310139" comment="POLAND Zloty" />
|
||||
<rate from="USD" to="RON" rate="3.100932" comment="ROMANIA Leu" />
|
||||
<rate from="USD" to="RUB" rate="32.14663" comment="RUSSIA Ruble" />
|
||||
<rate from="USD" to="SAR" rate="3.750465" comment="SAUDI ARABIA Riyal" />
|
||||
<rate from="USD" to="SGD" rate="1.299352" comment="SINGAPORE Dollar" />
|
||||
<rate from="USD" to="ZAR" rate="8.329761" comment="SOUTH AFRICA Rand" />
|
||||
<rate from="USD" to="SEK" rate="6.883442" comment="SWEDEN Krona" />
|
||||
<rate from="USD" to="CHF" rate="0.906035" comment="SWITZERLAND Franc" />
|
||||
<rate from="USD" to="TWD" rate="30.40283" comment="TAIWAN Dollar" />
|
||||
<rate from="USD" to="THB" rate="30.89487" comment="THAILAND Baht" />
|
||||
<rate from="USD" to="AED" rate="3.672955" comment="U.A.E. Dirham" />
|
||||
<rate from="USD" to="UAH" rate="7.988582" comment="UKRAINE Hryvnia" />
|
||||
<rate from="USD" to="GBP" rate="0.647910" comment="UNITED KINGDOM Pound" />
|
||||
|
||||
<!-- Cross-rates for some common currencies -->
|
||||
<rate from="EUR" to="GBP" rate="0.869914" />
|
||||
<rate from="EUR" to="NOK" rate="7.800095" />
|
||||
<rate from="GBP" to="NOK" rate="8.966508" />
|
||||
</rates>
|
||||
</currencyConfig>
|
|
@ -0,0 +1,42 @@
|
|||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!-- If this file is found in the config directory, it will only be
|
||||
loaded once at startup. If it is found in Solr's data
|
||||
directory, it will be re-loaded every commit.
|
||||
|
||||
See http://wiki.apache.org/solr/QueryElevationComponent for more info
|
||||
|
||||
-->
|
||||
<elevate>
|
||||
<!-- Query elevation examples
|
||||
<query text="foo bar">
|
||||
<doc id="1" />
|
||||
<doc id="2" />
|
||||
<doc id="3" />
|
||||
</query>
|
||||
|
||||
for use with techproducts example
|
||||
|
||||
<query text="ipod">
|
||||
<doc id="MA147LL/A" /> put the actual ipod at the top
|
||||
<doc id="IW-02" exclude="true" /> exclude this cable
|
||||
</query>
|
||||
-->
|
||||
|
||||
</elevate>
|
|
@ -0,0 +1,6 @@
|
|||
/eu/dnetlib/dhp/oa/provision/conf/files/currency.xml
|
||||
/eu/dnetlib/dhp/oa/provision/conf/files/elevate.xml
|
||||
/eu/dnetlib/dhp/oa/provision/conf/files/params.json
|
||||
/eu/dnetlib/dhp/oa/provision/conf/files/protwords.txt
|
||||
/eu/dnetlib/dhp/oa/provision/conf/files/stopwords.txt
|
||||
/eu/dnetlib/dhp/oa/provision/conf/files/synonyms.txt
|
|
@ -0,0 +1,20 @@
|
|||
{"params":{
|
||||
"query":{
|
||||
"defType":"edismax",
|
||||
"q.alt":"*:*",
|
||||
"rows":"10",
|
||||
"fl":"*,score",
|
||||
"":{"v":0}
|
||||
},
|
||||
"facets":{
|
||||
"facet":"on",
|
||||
"facet.mincount": "1",
|
||||
"":{"v":0}
|
||||
},
|
||||
"velocity":{
|
||||
"wt": "velocity",
|
||||
"v.template":"browse",
|
||||
"v.layout": "layout",
|
||||
"":{"v":0}
|
||||
}
|
||||
}}
|
|
@ -0,0 +1,21 @@
|
|||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
#-----------------------------------------------------------------------
|
||||
# Use a protected word file to protect against the stemmer reducing two
|
||||
# unrelated words to the same base word.
|
||||
|
||||
# Some non-words that normally won't be encountered,
|
||||
# just to test that they won't be stemmed.
|
||||
dontstems
|
||||
zwhacky
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
a
|
||||
an
|
||||
and
|
||||
are
|
||||
as
|
||||
at
|
||||
be
|
||||
but
|
||||
by
|
||||
for
|
||||
if
|
||||
in
|
||||
into
|
||||
is
|
||||
it
|
||||
no
|
||||
not
|
||||
of
|
||||
on
|
||||
or
|
||||
s
|
||||
such
|
||||
t
|
||||
that
|
||||
the
|
||||
their
|
||||
then
|
||||
there
|
||||
these
|
||||
they
|
||||
this
|
||||
to
|
||||
was
|
||||
will
|
||||
with
|
|
@ -0,0 +1,29 @@
|
|||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
#-----------------------------------------------------------------------
|
||||
#some test synonym mappings unlikely to appear in real input text
|
||||
aaafoo => aaabar
|
||||
bbbfoo => bbbfoo bbbbar
|
||||
cccfoo => cccbar cccbaz
|
||||
fooaaa,baraaa,bazaaa
|
||||
|
||||
# Some synonym groups specific to this example
|
||||
GB,gib,gigabyte,gigabytes
|
||||
MB,mib,megabyte,megabytes
|
||||
Television, Televisions, TV, TVs
|
||||
#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming
|
||||
#after us won't split it into two words.
|
||||
|
||||
# Synonym mappings can be used for spelling correction too
|
||||
pixima => pixma
|
||||
|
|
@ -0,0 +1,549 @@
|
|||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
|
||||
<xsl:output omit-xml-declaration="yes" indent="yes"/>
|
||||
|
||||
<xsl:template match="//FIELDS">
|
||||
|
||||
<xsl:param name="textFieldType" select="string('text_common')"/>
|
||||
<xsl:variable name="smallcase" select="'abcdefghijklmnopqrstuvwxyz'"/>
|
||||
<xsl:variable name="uppercase" select="'ABCDEFGHIJKLMNOPQRSTUVWXYZ'"/>
|
||||
|
||||
<!--
|
||||
D-Net index schema template
|
||||
|
||||
CHANGELOG
|
||||
|
||||
0.1 : first release
|
||||
0.2 : added preserveOriginal="1" for text field type in the index analyzer and catenateWords="1" for the query analyzer
|
||||
0.3 : changed language for SnowballPorterFilterFactory to language="German2" (index/query) in the text field type
|
||||
0.4 : added solr.ASCIIFoldingFilterFactory filter (index/query) in the text field type
|
||||
0.5 : added long_keyword field type, to be used for objIdentifiers
|
||||
0.6 : added field types for spellchecking
|
||||
0.7 : added parameter for text field type
|
||||
0.8 : added field _version_, needed by Solr 4.0.0 for the transaction log
|
||||
0.9 : added type: text_en_splitting
|
||||
0.91 : added type: ngramtext
|
||||
0.92 : added schema optimizations, removing unnecessary stored fields
|
||||
0.93 : added attribute preserveOriginal="1" to fieldtype ngramtext (query analysis) to improve matches
|
||||
0.94 : updated and simplified ngramtext fieldtype
|
||||
0.95 : update to solr 4.4, removed attribute "compress" from field definition, ngramfield doesn't support NGramFilterFactory anymore
|
||||
0.96 : update to solr 4.9
|
||||
0.97 : introduced field type string_ci supporting case insensitivity.
|
||||
1.0 : updated to solr 6.6.0
|
||||
-->
|
||||
<schema name="dnet" version="1.0">
|
||||
|
||||
<!-- Valid attributes for fields:
|
||||
name: mandatory - the name for the field
|
||||
type: mandatory - the name of a field type from the
|
||||
fieldTypes section
|
||||
indexed: true if this field should be indexed (searchable or sortable)
|
||||
stored: true if this field should be retrievable
|
||||
docValues: true if this field should have doc values. Doc values are
|
||||
useful (required, if you are using *Point fields) for faceting,
|
||||
grouping, sorting and function queries. Doc values will make the index
|
||||
faster to load, more NRT-friendly and more memory-efficient.
|
||||
They however come with some limitations: they are currently only
|
||||
supported by StrField, UUIDField, all Trie*Fields and *PointFields,
|
||||
and depending on the field type, they might require the field to be
|
||||
single-valued, be required or have a default value (check the
|
||||
documentation of the field type you're interested in for more information)
|
||||
multiValued: true if this field may contain multiple values per document
|
||||
omitNorms: (expert) set to true to omit the norms associated with
|
||||
this field (this disables length normalization and index-time
|
||||
boosting for the field, and saves some memory). Only full-text
|
||||
fields or fields that need an index-time boost need norms.
|
||||
Norms are omitted for primitive (non-analyzed) types by default.
|
||||
termVectors: [false] set to true to store the term vector for a
|
||||
given field.
|
||||
When using MoreLikeThis, fields used for similarity should be
|
||||
stored for best performance.
|
||||
termPositions: Store position information with the term vector.
|
||||
This will increase storage costs.
|
||||
termOffsets: Store offset information with the term vector. This
|
||||
will increase storage costs.
|
||||
required: The field is required. It will throw an error if the
|
||||
value does not exist
|
||||
default: a value that should be used if no value is specified
|
||||
when adding a document.
|
||||
-->
|
||||
|
||||
<!-- field names should consist of alphanumeric or underscore characters only and
|
||||
not start with a digit. This is not currently strictly enforced,
|
||||
but other field names will not have first class support from all components
|
||||
and back compatibility is not guaranteed. Names with both leading and
|
||||
trailing underscores (e.g. _version_) are reserved.
|
||||
-->
|
||||
|
||||
<xsl:for-each select="./FIELD">
|
||||
<xsl:variable name="fieldname" select="translate(@name, $uppercase, $smallcase)"/>
|
||||
<xsl:variable name="fieldtype">
|
||||
<xsl:choose>
|
||||
<xsl:when test="@type"><xsl:value-of select="@type"/></xsl:when>
|
||||
<xsl:when test="@tokenizable='false'">string</xsl:when>
|
||||
<xsl:otherwise>
|
||||
<xsl:value-of select="$textFieldType"/>
|
||||
</xsl:otherwise>
|
||||
</xsl:choose>
|
||||
</xsl:variable>
|
||||
<xsl:variable name="isMultivalued">
|
||||
<xsl:choose>
|
||||
<xsl:when test="@multivalued='false'">false</xsl:when>
|
||||
<xsl:otherwise>true</xsl:otherwise>
|
||||
</xsl:choose>
|
||||
</xsl:variable>
|
||||
<xsl:variable name="isStored">
|
||||
<xsl:choose>
|
||||
<xsl:when test="@stored='true'">true</xsl:when>
|
||||
<xsl:otherwise>false</xsl:otherwise>
|
||||
</xsl:choose>
|
||||
</xsl:variable>
|
||||
|
||||
<field name="{$fieldname}" type="{$fieldtype}" indexed="{@indexable}" stored="{normalize-space($isStored)}" multiValued="{normalize-space($isMultivalued)}"/>
|
||||
</xsl:for-each>
|
||||
|
||||
<field name="__indexrecordidentifier" type="string" indexed="true" stored="true" multiValued="false" required="true"/>
|
||||
|
||||
<field name="__deleted" type="boolean" indexed="true" stored="false" default="false" omitNorms="true" omitTermFreqAndPositions="true"/>
|
||||
|
||||
<field name="__dsid" type="string" indexed="true" stored="true" omitNorms="true" omitTermFreqAndPositions="true"/>
|
||||
|
||||
<field name="__dsversion" type="pdate" indexed="true" stored="true" omitNorms="true" omitTermFreqAndPositions="true"/>
|
||||
|
||||
<field name="__result" type="string" indexed="false" stored="true" multiValued="false" docValues="false"/>
|
||||
|
||||
<field name="__all" type="{$textFieldType}" indexed="true" stored="false" multiValued="true"/>
|
||||
|
||||
<field name="_version_" type="long" indexed="true" stored="true" multiValued="false" />
|
||||
|
||||
<field name="_root_" type="string" indexed="true" stored="false" docValues="false" />
|
||||
|
||||
<!-- field for ping -->
|
||||
<field name="text" type="{$textFieldType}" indexed="false" stored="false"/>
|
||||
|
||||
<!-- Field to use to determine and enforce document uniqueness.
|
||||
Unless this field is marked with required="false", it will be a required field
|
||||
-->
|
||||
<uniqueKey>__indexrecordidentifier</uniqueKey>
|
||||
|
||||
<xsl:for-each select="./FIELD[@copy = 'true']">
|
||||
<xsl:variable name="fieldname" select="translate(@name, $uppercase, $smallcase)"/>
|
||||
<copyField source="{$fieldname}" dest="__all"/>
|
||||
</xsl:for-each>
|
||||
|
||||
<!-- copyField commands copy one field to another at the time a document
|
||||
is added to the index. It's used either to index the same field differently,
|
||||
or to add multiple fields to the same field for easier/faster searching.
|
||||
|
||||
<copyField source="sourceFieldName" dest="destinationFieldName"/>
|
||||
-->
|
||||
|
||||
<!-- field type definitions. The "name" attribute is
|
||||
just a label to be used by field definitions. The "class"
|
||||
attribute and any other attributes determine the real
|
||||
behavior of the fieldType.
|
||||
Class names starting with "solr" refer to java classes in a
|
||||
standard package such as org.apache.solr.analysis
|
||||
-->
|
||||
|
||||
<!-- The StrField type is not analyzed, but indexed/stored verbatim.
|
||||
It supports doc values but in that case the field needs to be
|
||||
single-valued and either required or have a default value.
|
||||
-->
|
||||
<fieldType name="string" class="solr.StrField" sortMissingLast="true" docValues="true" />
|
||||
<fieldType name="strings" class="solr.StrField" sortMissingLast="true" multiValued="true" docValues="true" />
|
||||
|
||||
<!-- boolean type: "true" or "false" -->
|
||||
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
|
||||
|
||||
<fieldType name="booleans" class="solr.BoolField" sortMissingLast="true" multiValued="true"/>
|
||||
|
||||
<!-- sortMissingLast and sortMissingFirst attributes are optional attributes
|
||||
currently supported on types that are sorted internally as strings
|
||||
and on numeric types.
|
||||
This includes "string","boolean", "int", "float", "long", "date", "double",
|
||||
including the "Trie" and "Point" variants.
|
||||
- If sortMissingLast="true", then a sort on this field will cause documents
|
||||
without the field to come after documents with the field,
|
||||
regardless of the requested sort order (asc or desc).
|
||||
- If sortMissingFirst="true", then a sort on this field will cause documents
|
||||
without the field to come before documents with the field,
|
||||
regardless of the requested sort order.
|
||||
- If sortMissingLast="false" and sortMissingFirst="false" (the default),
|
||||
then default lucene sorting will be used which places docs without the
|
||||
field first in an ascending sort and last in a descending sort.
|
||||
-->
|
||||
|
||||
<!--
|
||||
Numeric field types that index values using KD-trees. *Point fields are faster and more efficient than Trie* fields both, at
|
||||
search time and at index time, but some features are still not supported.
|
||||
Point fields don't support FieldCache, so they must have docValues="true" if needed for sorting, faceting, functions, etc.
|
||||
-->
|
||||
<fieldType name="pint" class="solr.IntPointField" docValues="true"/>
|
||||
<fieldType name="pfloat" class="solr.FloatPointField" docValues="true"/>
|
||||
<fieldType name="plong" class="solr.LongPointField" docValues="true"/>
|
||||
<fieldType name="pdouble" class="solr.DoublePointField" docValues="true"/>
|
||||
|
||||
<fieldType name="pints" class="solr.IntPointField" docValues="true" multiValued="true"/>
|
||||
<fieldType name="pfloats" class="solr.FloatPointField" docValues="true" multiValued="true"/>
|
||||
<fieldType name="plongs" class="solr.LongPointField" docValues="true" multiValued="true"/>
|
||||
<fieldType name="pdoubles" class="solr.DoublePointField" docValues="true" multiValued="true"/>
|
||||
|
||||
<!--
|
||||
Default numeric field types. For faster range queries, consider *PointFields (pint/pfloat/plong/pdouble), or the
|
||||
tint/tfloat/tlong/tdouble types.
|
||||
-->
|
||||
<fieldType name="int" class="solr.TrieIntField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
|
||||
<fieldType name="float" class="solr.TrieFloatField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
|
||||
<fieldType name="long" class="solr.TrieLongField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
|
||||
<fieldType name="double" class="solr.TrieDoubleField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
|
||||
|
||||
<fieldType name="ints" class="solr.TrieIntField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
|
||||
<fieldType name="floats" class="solr.TrieFloatField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
|
||||
<fieldType name="longs" class="solr.TrieLongField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
|
||||
<fieldType name="doubles" class="solr.TrieDoubleField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
|
||||
|
||||
<!--
|
||||
Numeric field types that index each value at various levels of precision
|
||||
to accelerate range queries when the number of values between the range
|
||||
endpoints is large. See the javadoc for NumericRangeQuery for internal
|
||||
implementation details.
|
||||
|
||||
Smaller precisionStep values (specified in bits) will lead to more tokens
|
||||
indexed per value, slightly larger index size, and faster range queries.
|
||||
A precisionStep of 0 disables indexing at different precision levels.
|
||||
|
||||
Consider using pint/pfloat/plong/pdouble instead of Trie* fields if possible
|
||||
-->
|
||||
<fieldType name="tint" class="solr.TrieIntField" docValues="true" precisionStep="8" positionIncrementGap="0"/>
|
||||
<fieldType name="tfloat" class="solr.TrieFloatField" docValues="true" precisionStep="8" positionIncrementGap="0"/>
|
||||
<fieldType name="tlong" class="solr.TrieLongField" docValues="true" precisionStep="8" positionIncrementGap="0"/>
|
||||
<fieldType name="tdouble" class="solr.TrieDoubleField" docValues="true" precisionStep="8" positionIncrementGap="0"/>
|
||||
|
||||
<fieldType name="tints" class="solr.TrieIntField" docValues="true" precisionStep="8" positionIncrementGap="0" multiValued="true"/>
|
||||
<fieldType name="tfloats" class="solr.TrieFloatField" docValues="true" precisionStep="8" positionIncrementGap="0" multiValued="true"/>
|
||||
<fieldType name="tlongs" class="solr.TrieLongField" docValues="true" precisionStep="8" positionIncrementGap="0" multiValued="true"/>
|
||||
<fieldType name="tdoubles" class="solr.TrieDoubleField" docValues="true" precisionStep="8" positionIncrementGap="0" multiValued="true"/>
|
||||
|
||||
<!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
|
||||
is a more restricted form of the canonical representation of dateTime
|
||||
http://www.w3.org/TR/xmlschema-2/#dateTime
|
||||
The trailing "Z" designates UTC time and is mandatory.
|
||||
Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
|
||||
All other components are mandatory.
|
||||
|
||||
Expressions can also be used to denote calculations that should be
|
||||
performed relative to "NOW" to determine the value, ie...
|
||||
|
||||
NOW/HOUR
|
||||
... Round to the start of the current hour
|
||||
NOW-1DAY
|
||||
... Exactly 1 day prior to now
|
||||
NOW/DAY+6MONTHS+3DAYS
|
||||
... 6 months and 3 days in the future from the start of
|
||||
the current day
|
||||
|
||||
Consult the TrieDateField javadocs for more information.
|
||||
-->
|
||||
<!-- KD-tree versions of date fields -->
|
||||
<fieldType name="pdate" class="solr.DatePointField" docValues="true"/>
|
||||
<fieldType name="pdates" class="solr.DatePointField" docValues="true" multiValued="true"/>
|
||||
|
||||
<fieldType name="date" class="solr.TrieDateField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
|
||||
<fieldType name="dates" class="solr.TrieDateField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
|
||||
|
||||
<fieldType name="tdate" class="solr.TrieDateField" docValues="true" precisionStep="6" positionIncrementGap="0"/>
|
||||
<fieldType name="tdates" class="solr.TrieDateField" docValues="true" precisionStep="6" positionIncrementGap="0" multiValued="true"/>
|
||||
|
||||
|
||||
<!--Binary data type. The data should be sent/retrieved in as Base64 encoded Strings -->
|
||||
<fieldType name="binary" class="solr.BinaryField"/>
|
||||
|
||||
<!-- The "RandomSortField" is not used to store or search any
|
||||
data. You can declare fields of this type it in your schema
|
||||
to generate pseudo-random orderings of your docs for sorting
|
||||
or function purposes. The ordering is generated based on the field
|
||||
name and the version of the index. As long as the index version
|
||||
remains unchanged, and the same field name is reused,
|
||||
the ordering of the docs will be consistent.
|
||||
If you want different pseudo-random orderings of documents,
|
||||
for the same version of the index, use a dynamicField and
|
||||
change the field name in the request.
|
||||
-->
|
||||
<fieldType name="random" class="solr.RandomSortField" indexed="true" />
|
||||
|
||||
<!-- solr.TextField allows the specification of custom text analyzers
|
||||
specified as a tokenizer and a list of token filters. Different
|
||||
analyzers may be specified for indexing and querying.
|
||||
|
||||
The optional positionIncrementGap puts space between multiple fields of
|
||||
this type on the same document, with the purpose of preventing false phrase
|
||||
matching across fields.
|
||||
|
||||
For more info on customizing your analyzer chain, please see
|
||||
http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
|
||||
-->
|
||||
|
||||
<!-- One can also specify an existing Analyzer class that has a
|
||||
default constructor via the class attribute on the analyzer element.
|
||||
Example:
|
||||
<fieldType name="text_greek" class="solr.TextField">
|
||||
<analyzer class="org.apache.lucene.analysis.el.GreekAnalyzer"/>
|
||||
</fieldType>
|
||||
-->
|
||||
|
||||
<!-- A text field that only splits on whitespace for exact matching of words -->
|
||||
<!-- <dynamicField name="*_ws" type="text_ws" indexed="true" stored="true"/> -->
|
||||
|
||||
<fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<fieldType name="ngramtext" class="solr.TextField">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.KeywordTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.NGramFilterFactory" minGramSize="3" maxGramSize="25"/>
|
||||
<filter class="solr.TrimFilterFactory"/>
|
||||
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.KeywordTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
|
||||
<fieldType name="personName" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<fieldType name="personNamePrefix" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
<filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="30" />
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
|
||||
<!-- A general text field that has reasonable, generic
|
||||
cross-language defaults: it tokenizes with StandardTokenizer,
|
||||
removes stop words from case-insensitive "stopwords.txt"
|
||||
(empty by default), and down cases. At query time only, it
|
||||
also applies synonyms.
|
||||
-->
|
||||
<fieldType name="text_common" class="solr.TextField" positionIncrementGap="100" multiValued="true">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
|
||||
<!-- in this example, we will only use synonyms at query time
|
||||
<filter class="solr.SynonymGraphFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
|
||||
<filter class="solr.FlattenGraphFilterFactory"/>
|
||||
-->
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
|
||||
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.EnglishPossessiveFilterFactory"/>
|
||||
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
|
||||
<filter class="solr.PorterStemFilterFactory"/>
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.SynonymGraphFilterFactory" expand="true" ignoreCase="true" synonyms="synonyms.txt"/>
|
||||
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.EnglishPossessiveFilterFactory"/>
|
||||
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
|
||||
<filter class="solr.PorterStemFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- A text field with defaults appropriate for English, plus
|
||||
aggressive word-splitting and autophrase features enabled.
|
||||
This field is just like text_en, except it adds
|
||||
WordDelimiterGraphFilter to enable splitting and matching of
|
||||
words on case-change, alpha numeric boundaries, and
|
||||
non-alphanumeric chars. This means certain compound word
|
||||
cases will work, for example query "wi fi" will match
|
||||
document "WiFi" or "wi-fi".
|
||||
-->
|
||||
<!-- <dynamicField name="*_txt_en_split" type="text_en_splitting" indexed="true" stored="true"/> -->
|
||||
<fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
<!-- in this example, we will only use synonyms at query time
|
||||
<filter class="solr.SynonymGraphFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
|
||||
-->
|
||||
<!-- Case insensitive stop word removal.
|
||||
-->
|
||||
<filter class="solr.StopFilterFactory"
|
||||
ignoreCase="true"
|
||||
words="stopwords.txt"
|
||||
/>
|
||||
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
|
||||
<filter class="solr.PorterStemFilterFactory"/>
|
||||
<filter class="solr.FlattenGraphFilterFactory" />
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
|
||||
<filter class="solr.StopFilterFactory"
|
||||
ignoreCase="true"
|
||||
words="stopwords.txt"
|
||||
/>
|
||||
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
|
||||
<filter class="solr.PorterStemFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- Less flexible matching, but less false matches. Probably not ideal for product names,
|
||||
but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
|
||||
<!-- <dynamicField name="*_txt_en_split_tight" type="text_en_splitting_tight" indexed="true" stored="true"/> -->
|
||||
<fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
|
||||
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
|
||||
<filter class="solr.EnglishMinimalStemFilterFactory"/>
|
||||
<!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
|
||||
possible with WordDelimiterGraphFilter in conjunction with stemming. -->
|
||||
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
|
||||
<filter class="solr.FlattenGraphFilterFactory" />
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
|
||||
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
|
||||
<filter class="solr.EnglishMinimalStemFilterFactory"/>
|
||||
<!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
|
||||
possible with WordDelimiterGraphFilter in conjunction with stemming. -->
|
||||
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- Just like text_common except it reverses the characters of
|
||||
each token, to enable more efficient leading wildcard queries.
|
||||
-->
|
||||
<!-- <dynamicField name="*_txt_rev" type="text_common_rev" indexed="true" stored="true"/> -->
|
||||
<fieldType name="text_rev" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
|
||||
maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- <dynamicField name="*_phon_en" type="phonetic_en" indexed="true" stored="true"/> -->
|
||||
<fieldType name="phonetic_en" stored="false" indexed="true" class="solr.TextField" >
|
||||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<fieldType name="string_ci" class="solr.TextField" sortMissingLast="true" omitNorms="true">
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.KeywordTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!--
|
||||
Example of using PathHierarchyTokenizerFactory at index time, so
|
||||
queries for paths match documents at that path, or in descendent paths
|
||||
-->
|
||||
<!-- <dynamicField name="*_descendent_path" type="descendent_path" indexed="true" stored="true"/> -->
|
||||
<fieldType name="descendent_path" class="solr.TextField">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" />
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.KeywordTokenizerFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!--
|
||||
Example of using PathHierarchyTokenizerFactory at query time, so
|
||||
queries for paths match documents at that path, or in ancestor paths
|
||||
-->
|
||||
<!-- <dynamicField name="*_ancestor_path" type="ancestor_path" indexed="true" stored="true"/> -->
|
||||
<fieldType name="ancestor_path" class="solr.TextField">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.KeywordTokenizerFactory" />
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- since fields of this type are by default not stored or indexed,
|
||||
any data added to them will be ignored outright. -->
|
||||
<fieldType name="ignored" stored="false" indexed="false" docValues="false" multiValued="true" class="solr.StrField" />
|
||||
|
||||
<!-- This point type indexes the coordinates as separate fields (subFields)
|
||||
If subFieldType is defined, it references a type, and a dynamic field
|
||||
definition is created matching *___<typename>. Alternately, if
|
||||
subFieldSuffix is defined, that is used to create the subFields.
|
||||
Example: if subFieldType="double", then the coordinates would be
|
||||
indexed in fields myloc_0___double,myloc_1___double.
|
||||
Example: if subFieldSuffix="_d" then the coordinates would be indexed
|
||||
in fields myloc_0_d,myloc_1_d
|
||||
The subFields are an implementation detail of the fieldType, and end
|
||||
users normally should not need to know about them.
|
||||
-->
|
||||
<!-- <dynamicField name="*_point" type="point" indexed="true" stored="true"/> -->
|
||||
<fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/>
|
||||
|
||||
<!-- A specialized field for geospatial search filters and distance sorting. -->
|
||||
<fieldType name="location" class="solr.LatLonPointSpatialField" docValues="true"/>
|
||||
|
||||
<!-- An alternative geospatial field type new to Solr 4. It supports multiValued and polygon shapes.
|
||||
For more information about this and other Spatial fields new to Solr 4, see:
|
||||
http://wiki.apache.org/solr/SolrAdaptersForLuceneSpatial4
|
||||
-->
|
||||
<fieldType name="location_rpt" class="solr.SpatialRecursivePrefixTreeFieldType"
|
||||
geo="true" distErrPct="0.025" maxDistErr="0.001" distanceUnits="kilometers" />
|
||||
|
||||
</schema>
|
||||
</xsl:template>
|
||||
</xsl:stylesheet>
|
|
@ -0,0 +1,14 @@
|
|||
{
|
||||
"id":"solr",
|
||||
"address":"localhost:9983",
|
||||
"port":"8983",
|
||||
"webContext":"solr",
|
||||
"numShards":"4",
|
||||
"replicationFactor":"1",
|
||||
"maxShardsPerNode":"4",
|
||||
"host":"localhost",
|
||||
"luceneMatchVersion":"7.5.0",
|
||||
"feedingShutdownTolerance":"30000",
|
||||
"feedingBufferFlushThreshold":"1000",
|
||||
"feedingSimulationMode":"false"
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
<FIELDS><!-- SOURCE FIELD -->
|
||||
<FIELD indexable="true" name="source_identifier" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="source_type" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="false" name="source_publication_date" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="source_subType" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="source_pid" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="source_schema" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="source_publisher" tokenizable="true" stored="true" stat="false" xpath="None"/>
|
||||
<FIELD indexable="true" name="source_collected_from" tokenizable="true" stored="true" stat="false" xpath="None"/><!-- TARGET FIELD -->
|
||||
<FIELD indexable="true" name="target_identifier" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="target_type" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="target_subType" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="target_pid" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="target_schema" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="target_publisher" tokenizable="true" stored="true" stat="false" xpath="None"/>
|
||||
<FIELD indexable="true" name="target_collected_from" tokenizable="true" stored="true" stat="false" xpath="None"/>
|
||||
<FIELD indexable="false" name="target_publication_date" stored="true" stat="false" tokenizable="false" value="None"/><!-- RELATION FIELD -->
|
||||
<FIELD indexable="true" name="publicationDate" multivalued="false" stored="true" stat="false" type="pdate" value="None"/>
|
||||
<FIELD indexable="true" name="relation_name" multivalued="false" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="relation_inverse" multivalued="false" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="publisher_name" tokenizable="true" stored="true" stat="false" xpath="None"/>
|
||||
<FIELD indexable="true" name="linkprovider" tokenizable="true" stored="true" stat="false" xpath="None"/>
|
||||
</FIELDS>
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,32 @@
|
|||
[
|
||||
{
|
||||
"paramName":"c",
|
||||
"paramLongName":"cluster",
|
||||
"paramDescription":"should be cluster1 or cluster2",
|
||||
"paramRequired":true
|
||||
},
|
||||
{
|
||||
"paramName":"is",
|
||||
"paramLongName":"isURL",
|
||||
"paramDescription":"the Information Service LookUp URL",
|
||||
"paramRequired":true
|
||||
},
|
||||
{
|
||||
"paramName":"ip",
|
||||
"paramLongName":"inputPath",
|
||||
"paramDescription":"the source input path",
|
||||
"paramRequired":true
|
||||
},
|
||||
{
|
||||
"paramName":"b",
|
||||
"paramLongName":"batchSize",
|
||||
"paramDescription":"the batch size param",
|
||||
"paramRequired":false
|
||||
},
|
||||
{
|
||||
"paramName":"f",
|
||||
"paramLongName":"format",
|
||||
"paramDescription":"index metadata format name",
|
||||
"paramRequired":true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,14 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,113 @@
|
|||
<workflow-app name="Index Scholexplorer Infospace" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the sourcePath of the json RDDs</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookupUrl</name>
|
||||
<description>URL for the isLookup service</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>solrDeletionQuery</name>
|
||||
<value>*:*</value>
|
||||
<description>query used in the deleted by query operation</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>format</name>
|
||||
<description>metadata format name (SMF)</description>
|
||||
</property>
|
||||
|
||||
</parameters>
|
||||
|
||||
<start to="indexScholix"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="drop_solr_collection">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--format</arg><arg>${format}</arg>
|
||||
<arg>--action</arg><arg>DELETE_BY_QUERY</arg>
|
||||
<arg>--query</arg><arg>${solrDeletionQuery}</arg>
|
||||
<arg>--commit</arg><arg>true</arg>
|
||||
</java>
|
||||
<ok to="create_solr_index"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="create_solr_index">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--format</arg><arg>${format}</arg>
|
||||
<arg>--action</arg><arg>CREATE</arg>
|
||||
|
||||
</java>
|
||||
<ok to="indexScholix"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="indexScholix">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Index summary</name>
|
||||
<class>eu.dnetlib.dhp.sx.provision.SparkIndexCollectionOnSOLR</class>
|
||||
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--conf spark.dynamicAllocation.enabled=true
|
||||
--conf spark.shuffle.service.enabled=true
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--conf spark.dynamicAllocation.maxExecutors="16"
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--cluster</arg><arg>yarn</arg>
|
||||
<arg>--isURL</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--inputPath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--format</arg><arg>${format}</arg>
|
||||
|
||||
</spark>
|
||||
<ok to="commit_solr_collection"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="commit_solr_collection">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--format</arg><arg>${format}</arg>
|
||||
<arg>--action</arg><arg>COMMIT</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,185 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.provision;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.solr.client.solrj.SolrServerException;
|
||||
import org.apache.solr.client.solrj.impl.CloudSolrClient;
|
||||
import org.apache.solr.client.solrj.response.QueryResponse;
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrDocumentList;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.solr.common.params.MapSolrParams;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.Element;
|
||||
import org.dom4j.Node;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.junit.jupiter.api.*;
|
||||
|
||||
import eu.dnetlib.dhp.oa.provision.scholix.ScholixToSolr;
|
||||
|
||||
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
||||
public class ScholixIndexingTest extends SolrTest {
|
||||
|
||||
private static String LAYOUT_PATH = "/eu/dnetlib/dhp/oa/provision/SMF_layout.xml";
|
||||
|
||||
/**
|
||||
* This test verifies that the schema will be generated correctly
|
||||
* by get the profile of the metadataFormat and generating solr schema.xml
|
||||
* we expect that the fiedl in the metadataFormat are all in the field solr schema
|
||||
* @throws Exception
|
||||
*/
|
||||
@Test
|
||||
@Order(1)
|
||||
void testSchemaCreation() throws Exception {
|
||||
|
||||
final String layout = loadSMFLayout();
|
||||
assertNotNull(layout);
|
||||
assertTrue(StringUtils.isNotBlank(layout));
|
||||
|
||||
final String scheme = SolrUtil.getSchemaXML(loadSMFLayout());
|
||||
assertNotNull(scheme);
|
||||
assertTrue(StringUtils.isNotBlank(scheme));
|
||||
|
||||
final Document fields = parseDocument(layout);
|
||||
List<Node> params = fields.selectNodes("//FIELD");
|
||||
final List<String> exptectedFieldName = new ArrayList<>();
|
||||
for (Node param : params) {
|
||||
Element element = (Element) param;
|
||||
String name = element.attributeValue("name");
|
||||
exptectedFieldName.add(name.toLowerCase());
|
||||
}
|
||||
assertTrue(exptectedFieldName.size() > 0);
|
||||
|
||||
final Document parsedScheme = parseDocument(scheme);
|
||||
params = parsedScheme.selectNodes("//field");
|
||||
final List<String> createdFieldName = new ArrayList<>();
|
||||
for (Node param : params) {
|
||||
|
||||
Element element = (Element) param;
|
||||
String name = element.attributeValue("name");
|
||||
createdFieldName.add(name.toLowerCase());
|
||||
}
|
||||
assertTrue(createdFieldName.size() > 0);
|
||||
|
||||
exptectedFieldName.stream().map(createdFieldName::contains).forEach(Assertions::assertTrue);
|
||||
}
|
||||
|
||||
/***
|
||||
* Test the creation of the index works
|
||||
* we test if all the files are uploaded into
|
||||
* the zookeeper instance of SOLR under it's
|
||||
* collection name
|
||||
* @throws Exception
|
||||
*/
|
||||
@Test
|
||||
@Order(2)
|
||||
public void testCreateCollection() throws Exception {
|
||||
final String collectionName = "SMF-index-scholix";
|
||||
SolrUtil.uploadZookeperConfig(miniCluster.getZkClient(), collectionName, true, loadSMFLayout());
|
||||
|
||||
assertTrue(miniCluster.getZkClient().exists("/configs/" + collectionName, true));
|
||||
List<String> items = miniCluster.getZkClient().getChildren("/configs/" + collectionName, null, true);
|
||||
|
||||
List<String> configurationFiles = Files
|
||||
.list(
|
||||
Paths
|
||||
.get(
|
||||
Objects.requireNonNull(getClass().getResource(SolrUtil.CONF_BASE_PATH + "files/")).getPath()))
|
||||
.map(Path::getFileName)
|
||||
.filter(p -> !p.getFileName().toString().equalsIgnoreCase("file_list"))
|
||||
.map(Path::toString)
|
||||
.collect(Collectors.toList());
|
||||
configurationFiles.add("schema.xml");
|
||||
configurationFiles.add("solrconfig.xml");
|
||||
configurationFiles.forEach(s -> assertTrue(items.contains(s)));
|
||||
|
||||
SolrUtil.createCollection(miniCluster.getSolrClient(), "Scholix", 4, 1, 2, collectionName);
|
||||
|
||||
log.debug("Collection Created");
|
||||
final Map<String, String> queryParamMap = new HashMap<>();
|
||||
queryParamMap.put("q", "*:*");
|
||||
|
||||
MapSolrParams queryParams = new MapSolrParams(queryParamMap);
|
||||
final QueryResponse response = miniCluster.getSolrClient().query("Scholix", queryParams);
|
||||
final SolrDocumentList documents = response.getResults();
|
||||
assertEquals(0, documents.getNumFound());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
@Order(3)
|
||||
public void testFeedingSolrDocument() throws Exception {
|
||||
|
||||
InputStream gzipStream = new GZIPInputStream(
|
||||
Objects.requireNonNull(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/provision/scholix_records.gz")));
|
||||
Reader decoder = new InputStreamReader(gzipStream, StandardCharsets.UTF_8);
|
||||
BufferedReader buffered = new BufferedReader(decoder);
|
||||
String line = buffered.readLine();
|
||||
|
||||
final CloudSolrClient client = miniCluster.getSolrClient();
|
||||
client.setDefaultCollection("Scholix");
|
||||
int added = 0;
|
||||
while (line != null) {
|
||||
|
||||
final SolrInputDocument solrDocument = ScholixToSolr.toSolrDocument(line);
|
||||
|
||||
client.add(solrDocument);
|
||||
added++;
|
||||
line = buffered.readLine();
|
||||
}
|
||||
|
||||
client.commit();
|
||||
|
||||
log.debug(String.format("Feed %d documents", added));
|
||||
|
||||
final SolrDocumentList documents = executeQuery("*:*");
|
||||
assertEquals(added, documents.getNumFound());
|
||||
|
||||
documents.stream().map(s -> s.getFirstValue("source_pid").toString()).forEach(System.out::println);
|
||||
|
||||
SolrDocumentList source_pids = executeQuery("source_pid:\"10.15468/dl.u47azs\"");
|
||||
|
||||
System.out.println("source_pid.getNumFound() = " + source_pids.getNumFound());
|
||||
|
||||
source_pids.stream().map(s -> s.getFieldValue("source_pid")).forEach(System.out::println);
|
||||
|
||||
}
|
||||
|
||||
private SolrDocumentList executeQuery(final String query) throws SolrServerException, IOException {
|
||||
|
||||
final Map<String, String> queryParamMap = new HashMap<>();
|
||||
queryParamMap.put("q", query);
|
||||
|
||||
MapSolrParams queryParams = new MapSolrParams(queryParamMap);
|
||||
final QueryResponse response = miniCluster.getSolrClient().query("Scholix", queryParams);
|
||||
return response.getResults();
|
||||
}
|
||||
|
||||
/***
|
||||
* Utility for parsing XML
|
||||
* @param xml
|
||||
* @return Dom4J Document
|
||||
* @throws DocumentException
|
||||
*/
|
||||
private Document parseDocument(final String xml) throws DocumentException {
|
||||
return new SAXReader().read(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
|
||||
}
|
||||
|
||||
private String loadSMFLayout() throws IOException {
|
||||
return IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream(LAYOUT_PATH)));
|
||||
}
|
||||
|
||||
}
|
|
@ -2,11 +2,9 @@
|
|||
package eu.dnetlib.dhp.oa.provision;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import org.apache.solr.client.solrj.response.SolrPingResponse;
|
||||
import org.apache.solr.client.solrj.response.UpdateResponse;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
class SolrAdminApplicationTest extends SolrTest {
|
||||
|
@ -24,7 +22,7 @@ class SolrAdminApplicationTest extends SolrTest {
|
|||
SolrAdminApplication admin = new SolrAdminApplication(miniCluster.getSolrClient().getZkHost());
|
||||
|
||||
UpdateResponse rsp = (UpdateResponse) admin
|
||||
.execute(SolrAdminApplication.Action.DELETE_BY_QUERY, DEFAULT_COLLECTION, "*:*", false);
|
||||
.execute(SolrAdminApplication.Action.DELETE_BY_QUERY, DEFAULT_COLLECTION, "*:*", false, null);
|
||||
|
||||
assertEquals(0, rsp.getStatus());
|
||||
}
|
||||
|
@ -38,5 +36,4 @@ class SolrAdminApplicationTest extends SolrTest {
|
|||
|
||||
assertEquals(0, rsp.getStatus());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
<LAYOUT name="index">
|
||||
<FIELDS>
|
||||
|
||||
<!-- SOURCE FIELD -->
|
||||
<FIELD indexable="true" name="source_identifier" multivalued="false" stored="false" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="source_type" multivalued="false" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="false" name="source_publication_date" multivalued="false" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="source_subtype" multivalued="false" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="source_pid" multivalued="true" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="source_schema" multivalued="true" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="source_publisher" multivalued="true" tokenizable="true" stored="true" stat="false" xpath="None"/>
|
||||
<FIELD indexable="true" name="source_collected_from" multivalued="true" tokenizable="true" stored="true" stat="false" xpath="None"/>
|
||||
|
||||
<!-- TARGET FIELD -->
|
||||
<FIELD indexable="true" name="target_identifier" multivalued="false" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="target_type" multivalued="false" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="false" name="target_publication_date" multivalued="false" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="target_subtype" multivalued="false" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="target_pid" multivalued="true" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="target_schema" multivalued="true" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="target_publisher" multivalued="true" tokenizable="true" stored="true" stat="false" xpath="None"/>
|
||||
<FIELD indexable="true" name="target_collected_from" multivalued="true" tokenizable="true" stored="true" stat="false" xpath="None"/>
|
||||
|
||||
<!-- RELATION FIELD -->
|
||||
<FIELD indexable="true" name="publication_date" multivalued="false" stored="true" stat="false" type="date" value="None"/>
|
||||
<FIELD indexable="true" name="relation_name" multivalued="false" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="relation_inverse" multivalued="false" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="publisher_name" multivalued="true" tokenizable="true" stored="true" stat="false" xpath="None"/>
|
||||
<FIELD indexable="true" name="link_provider" multivalued="true" tokenizable="true" stored="true" stat="false" xpath="None"/>
|
||||
</FIELDS>
|
||||
</LAYOUT>
|
Binary file not shown.
|
@ -352,7 +352,9 @@
|
|||
</goals>
|
||||
<configuration>
|
||||
<tasks>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<property name="assembly-resources.loc" value="${maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path}" />
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<unjar src="${assembly-resources.loc}" dest="${project.build.directory}/assembly-resources" />
|
||||
</tasks>
|
||||
</configuration>
|
||||
|
@ -427,9 +429,12 @@
|
|||
<configuration>
|
||||
<executable>ssh</executable>
|
||||
<arguments>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}</argument>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>-p ${dhp.hadoop.frontend.port.ssh}</argument>
|
||||
<argument>-o StrictHostKeyChecking=no</argument>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>rm -rf ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; mkdir -p ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/</argument>
|
||||
</arguments>
|
||||
</configuration>
|
||||
|
@ -443,9 +448,11 @@
|
|||
<configuration>
|
||||
<executable>scp</executable>
|
||||
<arguments>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>-P ${dhp.hadoop.frontend.port.ssh}</argument>
|
||||
<argument>-o StrictHostKeyChecking=no</argument>
|
||||
<argument>target/${oozie.package.file.name}.tar.gz</argument>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}:${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/${oozie.package.file.name}.tar.gz</argument>
|
||||
</arguments>
|
||||
</configuration>
|
||||
|
@ -460,11 +467,15 @@
|
|||
<executable>ssh</executable>
|
||||
<!-- <outputFile>target/redirected_upload.log</outputFile> -->
|
||||
<arguments>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}</argument>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>-p ${dhp.hadoop.frontend.port.ssh}</argument>
|
||||
<argument>-o StrictHostKeyChecking=no</argument>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>cd ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; </argument>
|
||||
<argument>tar -zxf oozie-package.tar.gz; </argument>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>rm ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/oozie-package.tar.gz; </argument>
|
||||
<argument>./upload_workflow.sh</argument>
|
||||
</arguments>
|
||||
|
@ -495,9 +506,12 @@
|
|||
<!-- this file will be used by test verification profile reading job identifier -->
|
||||
<outputFile>${oozie.execution.log.file.location}</outputFile>
|
||||
<arguments>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}</argument>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>-p ${dhp.hadoop.frontend.port.ssh}</argument>
|
||||
<argument>-o StrictHostKeyChecking=no</argument>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>cd ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; </argument>
|
||||
<argument>./run_workflow.sh</argument>
|
||||
</arguments>
|
||||
|
@ -512,6 +526,7 @@
|
|||
<configuration>
|
||||
<executable>cat</executable>
|
||||
<arguments>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>${oozie.execution.log.file.location}</argument>
|
||||
</arguments>
|
||||
</configuration>
|
||||
|
|
Loading…
Reference in New Issue