diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala
index 578db1ea9..85fbd99c4 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala
@@ -66,7 +66,7 @@ object SparkDownloadEBILinks {
val log: Logger = LoggerFactory.getLogger(getClass)
val MAX_ITEM_PER_PARTITION = 20000
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/ebi_download_update.json")))
+ val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
@@ -86,7 +86,7 @@ object SparkDownloadEBILinks {
val workingPath = parser.get("workingPath")
log.info(s"workingPath -> $workingPath")
- log.info("Getting max pubmedId where the links have been requested")
+    log.info("Getting max pubmedId where the links have already been requested")
val links: Dataset[EBILinkItem] = spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
val lastPMIDRequested = links.map(l => l.id).select(max("value")).first.getLong(0)
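Note (illustration, not part of the patch): the hunk above derives a high-watermark — the largest pubmed id for which EBI links were already requested — so that only newer identifiers are downloaded on the next run. A minimal Scala sketch of that pattern, assuming a hypothetical `pubmedIds` Dataset[Long] of candidate identifiers alongside the `links` Dataset[EBILinkItem] shown above:

    import org.apache.spark.sql.{Dataset, SparkSession}
    import org.apache.spark.sql.functions.max

    // Hedged sketch only; `pubmedIds` and `newIdsToRequest` are assumptions, not part of this change.
    def newIdsToRequest(spark: SparkSession, links: Dataset[EBILinkItem], pubmedIds: Dataset[Long]): Dataset[Long] = {
      import spark.implicits._
      // largest pubmed id whose links were already requested (same expression as in the hunk above)
      val lastPMIDRequested = links.map(_.id).select(max("value")).first.getLong(0)
      // keep only identifiers newer than the watermark
      pubmedIds.filter(_ > lastPMIDRequested)
    }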
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala
index 0db469769..10467884c 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala
@@ -14,7 +14,7 @@ object SparkEBILinksToOaf {
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/ebi_to_df_params.json")))
+ val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
@@ -31,7 +31,7 @@ object SparkEBILinksToOaf {
log.info(s"targetPath -> $targetPath")
implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
- val ebLinks: Dataset[EBILinkItem] = spark.read.load(s"${sourcePath}_dataset").as[EBILinkItem].filter(l => l.links != null)
+ val ebLinks: Dataset[EBILinkItem] = spark.read.load(sourcePath).as[EBILinkItem].filter(l => l.links != null && l.links.startsWith("{"))
ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
.filter(p => BioDBToOAF.EBITargetLinksFilter(p))
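Note (illustration, not part of the patch): the new predicate `l.links != null && l.links.startsWith("{")` keeps only payloads that look like JSON objects, so BioDBToOAF.parse_ebi_links never receives empty or truncated bodies. A self-contained Scala sketch of the same guard on plain strings (the sample payload is arbitrary, not the EBI format):

    object LinksGuardSketch {
      def main(args: Array[String]): Unit = {
        val rawLinks = Seq(null, "", "not json", """{"hitCount": 2}""")
        // same shape as the filter added above: non-null and starting with a JSON object brace
        val parsable = rawLinks.filter(l => l != null && l.startsWith("{"))
        println(parsable) // List({"hitCount": 2})
      }
    }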
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/oozie_app/config-default.xml
similarity index 100%
rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/oozie_app/config-default.xml
rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/oozie_app/config-default.xml
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/oozie_app/workflow.xml
new file mode 100644
index 000000000..73b1a3b60
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/oozie_app/workflow.xml
@@ -0,0 +1,105 @@
+<workflow-app name="Download_EBI_Links" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>sourcePath</name>
+            <description>the Working Path</description>
+        </property>
+        <property>
+            <name>workingPath</name>
+            <description>the Working Path</description>
+        </property>
+        <property>
+            <name>targetPath</name>
+            <description>the OAF MDStore Path</description>
+        </property>
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCores</name>
+            <description>number of cores used by single executor</description>
+        </property>
+        <property>
+            <name>resumeFrom</name>
+            <value>DownloadEBILinks</value>
+            <description>node to start</description>
+        </property>
+    </parameters>
+
+    <start to="resume_from"/>
+
+    <decision name="resume_from">
+        <switch>
+            <case to="DownloadEBILinks">${wf:conf('resumeFrom') eq 'DownloadEBILinks'}</case>
+            <case to="CreateEBIDataSet">${wf:conf('resumeFrom') eq 'CreateEBIDataSet'}</case>
+            <default to="DownloadEBILinks"/>
+        </switch>
+    </decision>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <action name="DownloadEBILinks">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Incremental Download EBI Links</name>
+            <class>eu.dnetllib.dhp.sx.bio.ebi.SparkDownloadEBILinks</class>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.shuffle.partitions=2000
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--master</arg><arg>yarn</arg>
+        </spark>
+        <ok to="CreateEBIDataSet"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="CreateEBIDataSet">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Create OAF DataSet</name>
+            <class>eu.dnetllib.dhp.sx.bio.ebi.SparkEBILinksToOaf</class>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.sql.shuffle.partitions=2000
+                ${sparkExtraOPT}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${sourcePath}/ebi_links_dataset</arg>
+            <arg>--targetPath</arg><arg>${targetPath}</arg>
+            <arg>--master</arg><arg>yarn</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/oozie_app/workflow.xml
deleted file mode 100644
index 7612321c0..000000000
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/oozie_app/workflow.xml
+++ /dev/null
@@ -1,99 +0,0 @@
-<workflow-app name="Create_EBI_Dataset" xmlns="uri:oozie:workflow:0.5">
-    <parameters>
-        <property>
-            <name>workingPath</name>
-            <description>the Working Path</description>
-        </property>
-        <property>
-            <name>sparkDriverMemory</name>
-            <description>memory for driver process</description>
-        </property>
-        <property>
-            <name>sparkExecutorMemory</name>
-            <description>memory for individual executor</description>
-        </property>
-        <property>
-            <name>sparkExecutorCores</name>
-            <description>number of cores used by single executor</description>
-        </property>
-    </parameters>
-
-    <start to="CreateBaselineDataset"/>
-
-    <kill name="Kill">
-        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-    </kill>
-
-    <action name="CreateBaselineDataset">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Create Baselnie DataSet</name>
-            <class>eu.dnetlib.dhp.sx.ebi.SparkCreateBaselineDataFrame</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=1
-                --driver-memory=${sparkDriverMemory}
-                --executor-cores=${sparkExecutorCores}
-                ${sparkExtraOPT}
-            </spark-opts>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
-            <arg>--master</arg><arg>yarn</arg>
-            <arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
-        </spark>
-        <ok to="CreateEBIDataSet"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="CreateEBIDataSet">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Create EBI DataSet</name>
-            <class>eu.dnetlib.dhp.sx.ebi.SparkEBILinksToOaf</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.sql.shuffle.partitions=1000
-                ${sparkExtraOPT}
-            </spark-opts>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
-            <arg>--master</arg><arg>yarn</arg>
-        </spark>
-        <ok to="CreateLinkUpdates"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="CreateLinkUpdates">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Create Baseline DataSet</name>
-            <class>eu.dnetlib.dhp.sx.ebi.SparkAddLinkUpdates</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=1
-                --driver-memory=${sparkDriverMemory}
-                --executor-cores=${sparkExecutorCores}
-                ${sparkExtraOPT}
-            </spark-opts>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
-            <arg>--master</arg><arg>yarn</arg>
-        </spark>
-        <ok to="End"/>
-        <error to="Kill"/>
-    </action>
-
-    <end name="End"/>
-</workflow-app>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/update/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/update/oozie_app/config-default.xml
deleted file mode 100644
index 17cd6c9a3..000000000
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/update/oozie_app/config-default.xml
+++ /dev/null
@@ -1,68 +0,0 @@
-<configuration>
-    <property>
-        <name>jobTracker</name>
-        <value>yarnRM</value>
-    </property>
-    <property>
-        <name>nameNode</name>
-        <value>hdfs://nameservice1</value>
-    </property>
-    <property>
-        <name>hive_metastore_uris</name>
-        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
-    </property>
-    <property>
-        <name>spark2YarnHistoryServerAddress</name>
-        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
-    </property>
-    <property>
-        <name>oozie.launcher.mapreduce.user.classpath.first</name>
-        <value>true</value>
-    </property>
-    <property>
-        <name>oozie.use.system.libpath</name>
-        <value>true</value>
-    </property>
-    <property>
-        <name>oozie.action.sharelib.for.spark</name>
-        <value>spark2</value>
-    </property>
-    <property>
-        <name>spark2EventLogDir</name>
-        <value>/user/spark/spark2ApplicationHistory</value>
-    </property>
-    <property>
-        <name>spark2ExtraListeners</name>
-        <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
-    </property>
-    <property>
-        <name>spark2SqlQueryExecutionListeners</name>
-        <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
-    </property>
-</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/update/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/update/oozie_app/workflow.xml
deleted file mode 100644
index cd3bb8c71..000000000
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/update/oozie_app/workflow.xml
+++ /dev/null
@@ -1,67 +0,0 @@
-<workflow-app name="Download_EBI_Links_Update" xmlns="uri:oozie:workflow:0.5">
-    <parameters>
-        <property>
-            <name>sourcePath</name>
-            <description>the Working Path</description>
-        </property>
-        <property>
-            <name>workingPath</name>
-            <description>the Working Path</description>
-        </property>
-        <property>
-            <name>sparkDriverMemory</name>
-            <description>memory for driver process</description>
-        </property>
-        <property>
-            <name>sparkExecutorMemory</name>
-            <description>memory for individual executor</description>
-        </property>
-        <property>
-            <name>sparkExecutorCores</name>
-            <description>number of cores used by single executor</description>
-        </property>
-    </parameters>
-
-    <start to="DownloadEBILinks"/>
-
-    <kill name="Kill">
-        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-    </kill>
-
-    <action name="DownloadEBILinks">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Incremental Download EBI Links</name>
-            <class>eu.dnetlib.dhp.sx.graph.ebi.SparkDownloadEBILinks</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.shuffle.partitions=2000
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-            </spark-opts>
-            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
-            <arg>--master</arg><arg>yarn</arg>
-        </spark>
-        <ok to="End"/>
-        <error to="Kill"/>
-    </action>
-
-    <end name="End"/>
-</workflow-app>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigExploreTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigExploreTest.java
new file mode 100644
index 000000000..9bc2924c3
--- /dev/null
+++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigExploreTest.java
@@ -0,0 +1,126 @@
+
+package eu.dnetlib.dhp.oa.provision;
+
+import java.io.IOException;
+import java.net.URI;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.solr.client.solrj.SolrQuery;
+import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.params.CommonParams;
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.SparkSession;
+import org.junit.jupiter.api.*;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.mockito.Mock;
+import org.mockito.Mockito;
+import org.mockito.junit.jupiter.MockitoExtension;
+
+import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
+import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+
+@ExtendWith(MockitoExtension.class)
+public class SolrConfigExploreTest extends SolrExploreTest {
+
+ protected static SparkSession spark;
+
+ private static final Integer batchSize = 100;
+
+ @Mock
+ private ISLookUpService isLookUpService;
+
+ @Mock
+ private ISLookupClient isLookupClient;
+
+ @BeforeEach
+ public void prepareMocks() throws ISLookUpException, IOException {
+ isLookupClient.setIsLookup(isLookUpService);
+
+ int solrPort = URI.create("http://" + miniCluster.getZkClient().getZkServerAddress()).getPort();
+
+ Mockito
+ .when(isLookupClient.getDsId(Mockito.anyString()))
+ .thenReturn("313f0381-23b6-466f-a0b8-c72a9679ac4b_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl");
+ Mockito.when(isLookupClient.getZkHost()).thenReturn(String.format("127.0.0.1:%s/solr", solrPort));
+ Mockito
+ .when(isLookupClient.getLayoutSource(Mockito.anyString()))
+ .thenReturn(IOUtils.toString(getClass().getResourceAsStream("fields.xml")));
+ Mockito
+ .when(isLookupClient.getLayoutTransformer())
+ .thenReturn(IOUtils.toString(getClass().getResourceAsStream("layoutToRecordTransformer.xsl")));
+ }
+
+ @BeforeAll
+ public static void before() {
+
+ SparkConf conf = new SparkConf();
+ conf.setAppName(XmlIndexingJobTest.class.getSimpleName());
+ conf.registerKryoClasses(new Class[] {
+ SerializableSolrInputDocument.class
+ });
+
+ conf.setMaster("local[1]");
+ conf.set("spark.driver.host", "localhost");
+ conf.set("hive.metastore.local", "true");
+ conf.set("spark.ui.enabled", "false");
+ conf.set("spark.sql.warehouse.dir", workingDir.resolve("spark").toString());
+
+ spark = SparkSession
+ .builder()
+ .appName(XmlIndexingJobTest.class.getSimpleName())
+ .config(conf)
+ .getOrCreate();
+
+
+ }
+
+ @AfterAll
+ public static void tearDown() {
+ spark.stop();
+ }
+
+ @Test
+ public void testSolrConfig() throws Exception {
+
+ String inputPath = "src/test/resources/eu/dnetlib/dhp/oa/provision/xml";
+
+ new XmlIndexingJob(spark, inputPath, FORMAT, batchSize, XmlIndexingJob.OutputFormat.SOLR, null).run(isLookupClient);
+ Assertions.assertEquals(0, miniCluster.getSolrClient().commit().getStatus());
+
+ String[] queryStrings = {
+ "cancer",
+ "graph",
+ "graphs"
+ };
+
+ for (String q : queryStrings) {
+ SolrQuery query = new SolrQuery();
+ query.setRequestHandler("/exploreSearch");
+ query.add(CommonParams.Q, q);
+ query.set("debugQuery", "on");
+
+ log.info("Submit query to Solr with params: {}", query.toString());
+ QueryResponse rsp = miniCluster.getSolrClient().query(query);
+// System.out.println(rsp.getHighlighting());
+// System.out.println(rsp.getExplainMap());
+
+ for (SolrDocument doc : rsp.getResults()) {
+ System.out.println(
+ doc.get("score") + "\t" +
+ doc.get("__indexrecordidentifier") + "\t" +
+ doc.get("resultidentifier") + "\t" +
+ doc.get("resultauthor") + "\t" +
+ doc.get("resultacceptanceyear") + "\t" +
+ doc.get("resultsubject") + "\t" +
+ doc.get("resulttitle") + "\t" +
+ doc.get("relprojectname") + "\t" +
+ doc.get("resultdescription") + "\t" +
+ doc.get("__all") + "\t"
+ );
+ }
+ }
+ }
+}
diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigTest.java
new file mode 100644
index 000000000..e20ecf152
--- /dev/null
+++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrConfigTest.java
@@ -0,0 +1,131 @@
+
+package eu.dnetlib.dhp.oa.provision;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.net.URI;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.solr.client.solrj.SolrQuery;
+import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrInputField;
+import org.apache.solr.common.params.CommonParams;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SparkSession;
+import org.dom4j.io.SAXReader;
+import org.junit.jupiter.api.*;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.mockito.Mock;
+import org.mockito.Mockito;
+import org.mockito.junit.jupiter.MockitoExtension;
+
+import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
+import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+
+@ExtendWith(MockitoExtension.class)
+public class SolrConfigTest extends SolrTest {
+
+ protected static SparkSession spark;
+
+ private static final Integer batchSize = 100;
+
+ @Mock
+ private ISLookUpService isLookUpService;
+
+ @Mock
+ private ISLookupClient isLookupClient;
+
+ @BeforeEach
+ public void prepareMocks() throws ISLookUpException, IOException {
+ isLookupClient.setIsLookup(isLookUpService);
+
+ int solrPort = URI.create("http://" + miniCluster.getZkClient().getZkServerAddress()).getPort();
+
+ Mockito
+ .when(isLookupClient.getDsId(Mockito.anyString()))
+ .thenReturn("313f0381-23b6-466f-a0b8-c72a9679ac4b_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl");
+ Mockito.when(isLookupClient.getZkHost()).thenReturn(String.format("127.0.0.1:%s/solr", solrPort));
+ Mockito
+ .when(isLookupClient.getLayoutSource(Mockito.anyString()))
+ .thenReturn(IOUtils.toString(getClass().getResourceAsStream("fields.xml")));
+ Mockito
+ .when(isLookupClient.getLayoutTransformer())
+ .thenReturn(IOUtils.toString(getClass().getResourceAsStream("layoutToRecordTransformer.xsl")));
+ }
+
+ @BeforeAll
+ public static void before() {
+
+ SparkConf conf = new SparkConf();
+ conf.setAppName(XmlIndexingJobTest.class.getSimpleName());
+ conf.registerKryoClasses(new Class[] {
+ SerializableSolrInputDocument.class
+ });
+
+ conf.setMaster("local[1]");
+ conf.set("spark.driver.host", "localhost");
+ conf.set("hive.metastore.local", "true");
+ conf.set("spark.ui.enabled", "false");
+ conf.set("spark.sql.warehouse.dir", workingDir.resolve("spark").toString());
+
+ spark = SparkSession
+ .builder()
+ .appName(XmlIndexingJobTest.class.getSimpleName())
+ .config(conf)
+ .getOrCreate();
+
+
+ }
+
+ @AfterAll
+ public static void tearDown() {
+ spark.stop();
+ }
+
+ @Test
+ public void testSolrConfig() throws Exception {
+
+ String inputPath = "src/test/resources/eu/dnetlib/dhp/oa/provision/xml";
+
+ new XmlIndexingJob(spark, inputPath, FORMAT, batchSize, XmlIndexingJob.OutputFormat.SOLR, null).run(isLookupClient);
+ Assertions.assertEquals(0, miniCluster.getSolrClient().commit().getStatus());
+
+ String[] queryStrings = {
+ "cancer",
+ "graph",
+ "graphs"
+ };
+
+ for (String q : queryStrings) {
+ SolrQuery query = new SolrQuery();
+ query.add(CommonParams.Q, q);
+
+ log.info("Submit query to Solr with params: {}", query.toString());
+ QueryResponse rsp = miniCluster.getSolrClient().query(query);
+
+ for (SolrDocument doc : rsp.getResults()) {
+ System.out.println(
+ doc.get("score") + "\t" +
+ doc.get("__indexrecordidentifier") + "\t" +
+ doc.get("resultidentifier") + "\t" +
+ doc.get("resultauthor") + "\t" +
+ doc.get("resultacceptanceyear") + "\t" +
+ doc.get("resultsubject") + "\t" +
+ doc.get("resulttitle") + "\t" +
+ doc.get("relprojectname") + "\t" +
+ doc.get("resultdescription") + "\t" +
+ doc.get("__all") + "\t"
+ );
+ }
+ }
+ }
+}
diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrExploreTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrExploreTest.java
new file mode 100644
index 000000000..b86fd8ac8
--- /dev/null
+++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrExploreTest.java
@@ -0,0 +1,109 @@
+
+package eu.dnetlib.dhp.oa.provision;
+
+import java.io.File;
+import java.nio.file.Path;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.solr.client.solrj.embedded.JettyConfig;
+import org.apache.solr.client.solrj.impl.CloudSolrClient;
+import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.client.solrj.request.ConfigSetAdminRequest;
+import org.apache.solr.client.solrj.request.QueryRequest;
+import org.apache.solr.cloud.MiniSolrCloudCluster;
+import org.apache.solr.common.params.CollectionParams;
+import org.apache.solr.common.params.CoreAdminParams;
+import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.common.util.NamedList;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.io.TempDir;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public abstract class SolrExploreTest {
+
+ protected static final Logger log = LoggerFactory.getLogger(SolrTest.class);
+
+ protected static final String FORMAT = "test";
+ protected static final String DEFAULT_COLLECTION = FORMAT + "-index-openaire";
+ protected static final String CONFIG_NAME = "testConfig";
+
+ protected static MiniSolrCloudCluster miniCluster;
+
+ @TempDir
+ public static Path workingDir;
+
+ @BeforeAll
+ public static void setup() throws Exception {
+
+ // random unassigned HTTP port
+ final int jettyPort = 0;
+ final JettyConfig jettyConfig = JettyConfig.builder().setPort(jettyPort).build();
+
+ log.info(String.format("working directory: %s", workingDir.toString()));
+ System.setProperty("solr.log.dir", workingDir.resolve("logs").toString());
+
+ // create a MiniSolrCloudCluster instance
+ miniCluster = new MiniSolrCloudCluster(2, workingDir.resolve("solr"), jettyConfig);
+
+ // Upload Solr configuration directory to ZooKeeper
+ String solrZKConfigDir = "src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/exploreTestConfig";
+ File configDir = new File(solrZKConfigDir);
+
+ miniCluster.uploadConfigSet(configDir.toPath(), CONFIG_NAME);
+
+ // override settings in the solrconfig include
+ System.setProperty("solr.tests.maxBufferedDocs", "100000");
+ System.setProperty("solr.tests.maxIndexingThreads", "-1");
+ System.setProperty("solr.tests.ramBufferSizeMB", "100");
+
+ // use non-test classes so RandomizedRunner isn't necessary
+ System.setProperty("solr.tests.mergeScheduler", "org.apache.lucene.index.ConcurrentMergeScheduler");
+ System.setProperty("solr.directoryFactory", "solr.RAMDirectoryFactory");
+ System.setProperty("solr.lock.type", "single");
+
+ log.info(new ConfigSetAdminRequest.List().process(miniCluster.getSolrClient()).toString());
+ log
+ .info(
+ CollectionAdminRequest.ClusterStatus
+ .getClusterStatus()
+ .process(miniCluster.getSolrClient())
+ .toString());
+
+ NamedList