Fixed error loading files on Solr: in a cluster it is not possible to iterate over files inside a jar

This commit is contained in:
Sandro La Bruzzo 2022-10-18 10:45:40 +02:00
parent 818a936468
commit ffa8cdf981
7 changed files with 141 additions and 129 deletions

View File

@@ -94,6 +94,7 @@ public class SolrAdminApplication implements Closeable {
SolrUtil
.uploadZookeperConfig(this.solrClient.getZkStateReader().getZkClient(), collection, true, fields);
SolrUtil.createCollection(this.solrClient, collection, 48, 1, 12, collection);
return null;
default:
throw new IllegalArgumentException("action not managed: " + action);

View File

@@ -12,6 +12,7 @@ import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
@@ -56,9 +57,11 @@ public class SolrUtil {
private static final char DELIMITER = '$';
private static final String CONF_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf";
public static final String CONF_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/";
public static final String CONF_FILE_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/files/";
// public static final String CONF_FILE_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/files/";
public static final String LIST_FILE_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/files/file_list";
private static final String SCHEMA_TEMPLATE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/schemaTemplate.xslt";
@@ -155,6 +158,7 @@ public class SolrUtil {
}
private static String loadFileInClassPath(final String aPath) {
System.out.println("LOAD FILE FROM PATH: " + aPath);
try {
return IOUtils
.toString(Objects.requireNonNull(SolrUtil.class.getResourceAsStream(aPath)), Charset.defaultCharset());
@@ -164,7 +168,7 @@
}
public static Map<String, String> getServiceProperties() throws IOException {
final String properties = loadFileInClassPath(CONF_BASE_PATH + "/service_properties.json");
final String properties = loadFileInClassPath(CONF_BASE_PATH + "service_properties.json");
final ObjectMapper mapper = new ObjectMapper();
TypeFactory typeFactory = mapper.getTypeFactory();
MapType mapType = typeFactory.constructMapType(HashMap.class, String.class, String.class);
@@ -173,7 +177,7 @@
public static String getConfig() throws Exception {
final Map<String, String> p = getServiceProperties();
final String st = loadFileInClassPath(CONF_BASE_PATH + "/solrconfig.xml.st");
final String st = loadFileInClassPath(CONF_BASE_PATH + "solrconfig.xml.st");
final ST solrConfig = new ST(st, DELIMITER, DELIMITER);
p.forEach(solrConfig::add);
return solrConfig.render();
@@ -204,22 +208,15 @@
res.put("solrconfig.xml", getConfig().getBytes());
log.debug("adding solrconfig.xml to the resource map");
Files
.list(
Paths.get(Objects.requireNonNull(SolrUtil.class.getResource(CONF_FILE_BASE_PATH)).getPath()))
.map(Path::getFileName)
.forEach(s -> {
log.debug(String.format("put file from path %s", CONF_FILE_BASE_PATH + s));
res
.put(
String.valueOf(s),
Objects
.requireNonNull(loadFileInClassPath(CONF_FILE_BASE_PATH + s))
.getBytes(StandardCharsets.UTF_8));
});
String data = IOUtils
.toString(Objects.requireNonNull(SolrUtil.class.getResourceAsStream(LIST_FILE_BASE_PATH)));
Arrays.stream(data.split("\n")).forEach(s -> {
final String name = s.replace(CONF_BASE_PATH + "files/", "");
res
.put(
name,
Objects.requireNonNull(loadFileInClassPath(s)).getBytes(StandardCharsets.UTF_8));
});
return res;
} catch (Throwable e) {
throw new Exception("failed to build configuration", e);

View File

@@ -7,6 +7,7 @@ import java.io.IOException;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.solr.common.SolrInputDocument;
import org.apache.spark.SparkConf;
@@ -21,6 +22,7 @@ import org.slf4j.LoggerFactory;
import com.lucidworks.spark.util.SolrSupport;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.provision.ProvisionConstants;
import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
import eu.dnetlib.dhp.oa.provision.scholix.ScholixToSolr;
import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
@@ -33,7 +35,7 @@ public class SparkIndexCollectionOnSOLR {
// LOGGER initialized
private static final Logger log = LoggerFactory.getLogger(SparkIndexCollectionOnSOLR.class);
public static void main(String[] args) throws IOException {
public static void main(String[] args) throws IOException, ParseException {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
@@ -42,14 +44,16 @@ public class SparkIndexCollectionOnSOLR {
SparkIndexCollectionOnSOLR.class
.getResourceAsStream("/eu/dnetlib/dhp/sx/provision/index_solr_parameters.json"))));
parser.parseArgument(args);
final String cluster = parser.get("cluster");
log.info("Cluster is {}", cluster);
final String format = parser.get("format");
log.info("Index format name is {}", format);
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl is {}", isLookupUrl);
final String isLookupUrl = parser.get("isURL");
log.info("isURL is {}", isLookupUrl);
final String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
@@ -75,11 +79,12 @@ public class SparkIndexCollectionOnSOLR {
conf,
isSparkSessionManaged,
spark -> {
final ISLookupClient isLookupClient = new ISLookupClient(
ISLookupClientFactory.getLookUpService(isLookupUrl));
final String zkHost = isLookupClient.getZkHost();
final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));
final String zkHost = isLookup.getZkHost();
log.info("zkHost: {}", zkHost);
feedScholixToSOLRIndex(spark, inputPath, format, batchSize, zkHost);
final String collection = ProvisionConstants.getCollectionName(format);
log.info("collection: {}", collection);
feedScholixToSOLRIndex(spark, inputPath, collection, batchSize, zkHost);
});
}

View File

@@ -0,0 +1,6 @@
/eu/dnetlib/dhp/oa/provision/conf/files/currency.xml
/eu/dnetlib/dhp/oa/provision/conf/files/elevate.xml
/eu/dnetlib/dhp/oa/provision/conf/files/params.json
/eu/dnetlib/dhp/oa/provision/conf/files/protwords.txt
/eu/dnetlib/dhp/oa/provision/conf/files/stopwords.txt
/eu/dnetlib/dhp/oa/provision/conf/files/synonyms.txt

View File

@@ -7,7 +7,7 @@
},
{
"paramName":"is",
"paramLongName":"isLookupUrl",
"paramLongName":"isURL",
"paramDescription":"the Information Service LookUp URL",
"paramRequired":true
},

View File

@@ -1,111 +1,113 @@
<workflow-app name="Index Scholexplorer Infospace" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the sourcePath of the json RDDs</description>
</property>
<property>
<name>isLookupUrl</name>
<description>URL for the isLookup service</description>
</property>
<property>
<name>solrDeletionQuery</name>
<value>*:*</value>
<description>query used in the deleted by query operation</description>
</property>
<property>
<name>format</name>
<description>metadata format name (SMF)</description>
</property>
<workflow-app name="Index Scholexplorer Infospace" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the sourcePath of the json RDDs</description>
</property>
<property>
<name>isLookupUrl</name>
<description>URL for the isLookup service</description>
</property>
<property>
<name>solrDeletionQuery</name>
<value>*:*</value>
<description>query used in the deleted by query operation</description>
</property>
<property>
<name>format</name>
<description>metadata format name (SMF)</description>
</property>
</parameters>
</parameters>
<start to="create_solr_index"/>
<start to="indexScholix"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="drop_solr_collection">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--format</arg><arg>${format}</arg>
<arg>--action</arg><arg>DELETE_BY_QUERY</arg>
<arg>--query</arg><arg>${solrDeletionQuery}</arg>
<arg>--commit</arg><arg>true</arg>
</java>
<ok to="create_solr_index"/>
<error to="Kill"/>
</action>
<action name="drop_solr_collection">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--format</arg><arg>${format}</arg>
<arg>--action</arg><arg>DELETE_BY_QUERY</arg>
<arg>--query</arg><arg>${solrDeletionQuery}</arg>
<arg>--commit</arg><arg>true</arg>
</java>
<ok to="create_solr_index"/>
<error to="Kill"/>
</action>
<action name="create_solr_index">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--format</arg><arg>${format}</arg>
<arg>--action</arg><arg>CREATE</arg>
<action name="create_solr_index">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--format</arg><arg>${format}</arg>
<arg>--action</arg><arg>CREATE</arg>
</java>
<ok to="indexScholix"/>
<error to="Kill"/>
</action>
</java>
<ok to="indexScholix"/>
<error to="Kill"/>
</action>
<action name="indexScholix">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Index summary</name>
<class>eu.dnetlib.dhp.sx.provision.SparkIndexCollectionOnSOLR</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--conf spark.dynamicAllocation.maxExecutors="8"
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--cluster</arg><arg>yarn</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--inputPath</arg><arg>${sourcePath}</arg>
<arg>--format</arg><arg>${format}</arg>
<action name="indexScholix">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Index summary</name>
<class>eu.dnetlib.dhp.sx.provision.SparkIndexCollectionOnSOLR</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.dynamicAllocation.enabled=true
--conf spark.shuffle.service.enabled=true
--executor-memory=${sparkExecutorMemory}
--conf spark.dynamicAllocation.maxExecutors="16"
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--cluster</arg><arg>yarn</arg>
<arg>--isURL</arg><arg>${isLookupUrl}</arg>
<arg>--inputPath</arg><arg>${sourcePath}</arg>
<arg>--format</arg><arg>${format}</arg>
</spark>
<ok to="commit_solr_collection"/>
<error to="Kill"/>
</action>
</spark>
<ok to="commit_solr_collection"/>
<error to="Kill"/>
</action>
<action name="commit_solr_collection">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--format</arg><arg>${format}</arg>
<arg>--action</arg><arg>COMMIT</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="commit_solr_collection">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--format</arg><arg>${format}</arg>
<arg>--action</arg><arg>COMMIT</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>
<end name="End"/>
</workflow-app>

View File

@@ -97,8 +97,9 @@ public class ScholixIndexingTest extends SolrTest {
.list(
Paths
.get(
Objects.requireNonNull(getClass().getResource(SolrUtil.CONF_FILE_BASE_PATH)).getPath()))
Objects.requireNonNull(getClass().getResource(SolrUtil.CONF_BASE_PATH + "files/")).getPath()))
.map(Path::getFileName)
.filter(p -> !p.getFileName().toString().equalsIgnoreCase("file_list"))
.map(Path::toString)
.collect(Collectors.toList());
configurationFiles.add("schema.xml");