forked from D-Net/dnet-hadoop

commit ffa8cdf981 (parent 818a936468)

Fixed an error when loading configuration files into Solr: on the cluster it is not possible to iterate over files packaged inside a JAR, so the configuration resources are now enumerated through an explicit file_list manifest instead of a directory listing.
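For context on the commit message: on the cluster the provision JAR's configuration directory is a JAR entry, not a filesystem directory, so the old directory-listing approach fails at runtime. A minimal sketch of the failing pattern (the class name is hypothetical; the resource path is the one used in SolrUtil):

import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Objects;

public class JarResourceListing {
	public static void main(String[] args) throws Exception {
		// Works from an IDE or unit test, where resources are plain files on disk...
		// ...but on the cluster the resource URL looks like
		// jar:file:/.../dhp-graph-provision.jar!/eu/dnetlib/dhp/oa/provision/conf/files/
		// so url.getPath() is not a real filesystem path and Files.list throws
		// java.nio.file.NoSuchFileException.
		Files
			.list(
				Paths
					.get(
						Objects
							.requireNonNull(
								JarResourceListing.class.getResource("/eu/dnetlib/dhp/oa/provision/conf/files/"))
							.getPath()))
			.forEach(System.out::println);
	}
}

From an IDE or surefire run the same call succeeds because the resources are exploded on disk, which is why the bug only shows up on the cluster.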
@@ -94,6 +94,7 @@ public class SolrAdminApplication implements Closeable {
 				SolrUtil
 					.uploadZookeperConfig(this.solrClient.getZkStateReader().getZkClient(), collection, true, fields);
+				SolrUtil.createCollection(this.solrClient, collection, 48, 1, 12, collection);
 				return null;

 			default:
 				throw new IllegalArgumentException("action not managed: " + action);
@@ -12,6 +12,7 @@ import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Objects;
@@ -56,9 +57,11 @@ public class SolrUtil {

 	private static final char DELIMITER = '$';

-	private static final String CONF_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf";
+	public static final String CONF_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/";

-	public static final String CONF_FILE_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/files/";
+	// public static final String CONF_FILE_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/files/";
+
+	public static final String LIST_FILE_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/files/file_list";

 	private static final String SCHEMA_TEMPLATE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/schemaTemplate.xslt";
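Note that CONF_BASE_PATH now carries the trailing slash and is public, so callers concatenate file names directly; the two hunks below drop the leading "/" from the concatenated names accordingly, and the directory-only CONF_FILE_BASE_PATH constant is retired in favour of the explicit LIST_FILE_BASE_PATH manifest.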
@@ -155,6 +158,7 @@ public class SolrUtil {
 	}

 	private static String loadFileInClassPath(final String aPath) {
+		System.out.println("LOAD FILE FROM PATH: " + aPath);
 		try {
 			return IOUtils
 				.toString(Objects.requireNonNull(SolrUtil.class.getResourceAsStream(aPath)), Charset.defaultCharset());
@@ -164,7 +168,7 @@ public class SolrUtil {
 	}

 	public static Map<String, String> getServiceProperties() throws IOException {
-		final String properties = loadFileInClassPath(CONF_BASE_PATH + "/service_properties.json");
+		final String properties = loadFileInClassPath(CONF_BASE_PATH + "service_properties.json");
 		final ObjectMapper mapper = new ObjectMapper();
 		TypeFactory typeFactory = mapper.getTypeFactory();
 		MapType mapType = typeFactory.constructMapType(HashMap.class, String.class, String.class);
@@ -173,7 +177,7 @@ public class SolrUtil {

 	public static String getConfig() throws Exception {
 		final Map<String, String> p = getServiceProperties();
-		final String st = loadFileInClassPath(CONF_BASE_PATH + "/solrconfig.xml.st");
+		final String st = loadFileInClassPath(CONF_BASE_PATH + "solrconfig.xml.st");
 		final ST solrConfig = new ST(st, DELIMITER, DELIMITER);
 		p.forEach(solrConfig::add);
 		return solrConfig.render();
@@ -204,22 +208,15 @@ public class SolrUtil {

 			res.put("solrconfig.xml", getConfig().getBytes());
 			log.debug("adding solrconfig.xml to the resource map");

-			Files
-				.list(
-					Paths.get(Objects.requireNonNull(SolrUtil.class.getResource(CONF_FILE_BASE_PATH)).getPath()))
-				.map(Path::getFileName)
-				.forEach(s -> {
-					log.debug(String.format("put file from path %s", CONF_FILE_BASE_PATH + s));
-					res
-						.put(
-							String.valueOf(s),
-							Objects
-								.requireNonNull(loadFileInClassPath(CONF_FILE_BASE_PATH + s))
-								.getBytes(StandardCharsets.UTF_8));
-				});
+			String data = IOUtils
+				.toString(Objects.requireNonNull(SolrUtil.class.getResourceAsStream(LIST_FILE_BASE_PATH)));
+			Arrays.stream(data.split("\n")).forEach(s -> {
+				final String name = s.replace(CONF_BASE_PATH + "files/", "");
+				res
+					.put(
+						name,
+						Objects.requireNonNull(loadFileInClassPath(s)).getBytes(StandardCharsets.UTF_8));
+			});
 			return res;
 		} catch (Throwable e) {
 			throw new Exception("failed to build configuration", e);
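The replacement pattern, as a self-contained sketch: read a file_list manifest from the classpath and open each listed resource with getResourceAsStream, which resolves JAR entries and plain files alike. The class name below is hypothetical; the manifest path matches LIST_FILE_BASE_PATH above:

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;

import org.apache.commons.io.IOUtils;

public class ManifestResourceLoader {

	private static final String LIST_FILE = "/eu/dnetlib/dhp/oa/provision/conf/files/file_list";

	public static Map<String, byte[]> loadAll() throws IOException {
		final Map<String, byte[]> res = new HashMap<>();
		// The manifest itself is a classpath resource: one absolute resource path per line.
		try (InputStream in = Objects.requireNonNull(ManifestResourceLoader.class.getResourceAsStream(LIST_FILE))) {
			final String data = IOUtils.toString(in, StandardCharsets.UTF_8);
			for (String path : data.split("\n")) {
				// getResourceAsStream works whether the resource is a file on disk or a JAR entry.
				try (InputStream entry = Objects
					.requireNonNull(ManifestResourceLoader.class.getResourceAsStream(path))) {
					final String name = path.substring(path.lastIndexOf('/') + 1);
					res.put(name, IOUtils.toByteArray(entry));
				}
			}
		}
		return res;
	}
}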
@@ -7,6 +7,7 @@ import java.io.IOException;
 import java.util.Objects;
 import java.util.Optional;

+import org.apache.commons.cli.ParseException;
 import org.apache.commons.io.IOUtils;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.spark.SparkConf;
@@ -21,6 +22,7 @@ import org.slf4j.LoggerFactory;
 import com.lucidworks.spark.util.SolrSupport;

 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.oa.provision.ProvisionConstants;
 import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
 import eu.dnetlib.dhp.oa.provision.scholix.ScholixToSolr;
 import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
@@ -33,7 +35,7 @@ public class SparkIndexCollectionOnSOLR {
 	// LOGGER initialized
 	private static final Logger log = LoggerFactory.getLogger(SparkIndexCollectionOnSOLR.class);

-	public static void main(String[] args) throws IOException {
+	public static void main(String[] args) throws IOException, ParseException {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
 				.toString(
@@ -42,14 +44,16 @@ public class SparkIndexCollectionOnSOLR {
 					SparkIndexCollectionOnSOLR.class
 						.getResourceAsStream("/eu/dnetlib/dhp/sx/provision/index_solr_parameters.json"))));

 		parser.parseArgument(args);

 		final String cluster = parser.get("cluster");
 		log.info("Cluster is {}", cluster);

 		final String format = parser.get("format");
 		log.info("Index format name is {}", format);

-		final String isLookupUrl = parser.get("isLookupUrl");
-		log.info("isLookupUrl is {}", isLookupUrl);
+		final String isLookupUrl = parser.get("isURL");
+		log.info("isURL is {}", isLookupUrl);

 		final String inputPath = parser.get("inputPath");
 		log.info("inputPath: {}", inputPath);
@@ -75,11 +79,12 @@ public class SparkIndexCollectionOnSOLR {
 			conf,
 			isSparkSessionManaged,
 			spark -> {
-				final ISLookupClient isLookupClient = new ISLookupClient(
-					ISLookupClientFactory.getLookUpService(isLookupUrl));
-				final String zkHost = isLookupClient.getZkHost();
+				final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));
+				final String zkHost = isLookup.getZkHost();
 				log.info("zkHost: {}", zkHost);
-				feedScholixToSOLRIndex(spark, inputPath, format, batchSize, zkHost);
+				final String collection = ProvisionConstants.getCollectionName(format);
+				log.info("collection: {}", collection);
+				feedScholixToSOLRIndex(spark, inputPath, collection, batchSize, zkHost);
 			});
 	}
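Beyond the isLookupUrl → isURL option rename (mirrored in index_solr_parameters.json and in the workflow arguments below), the driver now resolves the target collection through ProvisionConstants.getCollectionName(format) instead of feeding documents to the raw format name.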
eu/dnetlib/dhp/oa/provision/conf/files/file_list (new file):

@@ -0,0 +1,6 @@
+/eu/dnetlib/dhp/oa/provision/conf/files/currency.xml
+/eu/dnetlib/dhp/oa/provision/conf/files/elevate.xml
+/eu/dnetlib/dhp/oa/provision/conf/files/params.json
+/eu/dnetlib/dhp/oa/provision/conf/files/protwords.txt
+/eu/dnetlib/dhp/oa/provision/conf/files/stopwords.txt
+/eu/dnetlib/dhp/oa/provision/conf/files/synonyms.txt
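The manifest is maintained by hand, so any file later added to conf/files/ must also be appended here, or getResourceMap will silently skip it. A hypothetical test-time guard for that drift (all names illustrative; it can only run where the resources are plain files on disk, e.g. from unit tests, for exactly the reason this commit exists):

import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;

public class FileListGuard {
	public static void main(String[] args) throws Exception {
		final String base = "/eu/dnetlib/dhp/oa/provision/conf/files/";
		// What is actually present on disk, minus the manifest itself.
		final Set<String> onDisk = Files
			.list(Paths.get(Objects.requireNonNull(FileListGuard.class.getResource(base)).getPath()))
			.map(Path::getFileName)
			.map(String::valueOf)
			.filter(n -> !n.equalsIgnoreCase("file_list"))
			.collect(Collectors.toSet());
		// What the manifest claims, reduced to bare file names.
		final Set<String> listed = Arrays
			.stream(IOUtils
				.toString(
					Objects.requireNonNull(FileListGuard.class.getResourceAsStream(base + "file_list")),
					StandardCharsets.UTF_8)
				.split("\n"))
			.map(s -> s.substring(s.lastIndexOf('/') + 1))
			.collect(Collectors.toSet());
		if (!onDisk.equals(listed)) {
			throw new IllegalStateException("file_list is out of sync with conf/files: " + onDisk + " vs " + listed);
		}
	}
}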
index_solr_parameters.json:

@@ -7,7 +7,7 @@
 	},
 	{
 		"paramName":"is",
-		"paramLongName":"isLookupUrl",
+		"paramLongName":"isURL",
 		"paramDescription":"the Information Service LookUp URL",
 		"paramRequired":true
 	},
@@ -1,111 +1,113 @@
 <workflow-app name="Index Scholexplorer Infospace" xmlns="uri:oozie:workflow:0.5">
 	<parameters>
 		<property>
 			<name>sourcePath</name>
 			<description>the sourcePath of the json RDDs</description>
 		</property>
 		<property>
 			<name>isLookupUrl</name>
 			<description>URL for the isLookup service</description>
 		</property>
 		<property>
 			<name>solrDeletionQuery</name>
 			<value>*:*</value>
 			<description>query used in the deleted by query operation</description>
 		</property>
 		<property>
 			<name>format</name>
 			<description>metadata format name (SMF)</description>
 		</property>
 	</parameters>

-	<start to="create_solr_index"/>
+	<start to="indexScholix"/>

 	<kill name="Kill">
 		<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
 	</kill>

 	<action name="drop_solr_collection">
 		<java>
 			<configuration>
 				<property>
 					<name>oozie.launcher.mapreduce.user.classpath.first</name>
 					<value>true</value>
 				</property>
 			</configuration>
 			<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
 			<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
 			<arg>--format</arg><arg>${format}</arg>
 			<arg>--action</arg><arg>DELETE_BY_QUERY</arg>
 			<arg>--query</arg><arg>${solrDeletionQuery}</arg>
 			<arg>--commit</arg><arg>true</arg>
 		</java>
 		<ok to="create_solr_index"/>
 		<error to="Kill"/>
 	</action>

 	<action name="create_solr_index">
 		<java>
 			<configuration>
 				<property>
 					<name>oozie.launcher.mapreduce.user.classpath.first</name>
 					<value>true</value>
 				</property>
 			</configuration>
 			<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
 			<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
 			<arg>--format</arg><arg>${format}</arg>
 			<arg>--action</arg><arg>CREATE</arg>
 		</java>
 		<ok to="indexScholix"/>
 		<error to="Kill"/>
 	</action>

 	<action name="indexScholix">
 		<spark xmlns="uri:oozie:spark-action:0.2">
 			<master>yarn</master>
 			<mode>cluster</mode>
 			<name>Index summary</name>
 			<class>eu.dnetlib.dhp.sx.provision.SparkIndexCollectionOnSOLR</class>
 			<jar>dhp-graph-provision-${projectVersion}.jar</jar>
 			<spark-opts>
+				--conf spark.dynamicAllocation.enabled=true
+				--conf spark.shuffle.service.enabled=true
 				--executor-memory=${sparkExecutorMemory}
-				--conf spark.dynamicAllocation.maxExecutors="8"
+				--conf spark.dynamicAllocation.maxExecutors="16"
 				--driver-memory=${sparkDriverMemory}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
 			</spark-opts>
 			<arg>--cluster</arg><arg>yarn</arg>
-			<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
+			<arg>--isURL</arg><arg>${isLookupUrl}</arg>
 			<arg>--inputPath</arg><arg>${sourcePath}</arg>
 			<arg>--format</arg><arg>${format}</arg>
 		</spark>
 		<ok to="commit_solr_collection"/>
 		<error to="Kill"/>
 	</action>

 	<action name="commit_solr_collection">
 		<java>
 			<configuration>
 				<property>
 					<name>oozie.launcher.mapreduce.user.classpath.first</name>
 					<value>true</value>
 				</property>
 			</configuration>
 			<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
 			<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
 			<arg>--format</arg><arg>${format}</arg>
 			<arg>--action</arg><arg>COMMIT</arg>
 		</java>
 		<ok to="End"/>
 		<error to="Kill"/>
 	</action>

 	<end name="End"/>
 </workflow-app>
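A note on the added Spark options: on YARN with Spark 2.x, spark.dynamicAllocation.enabled=true requires the external shuffle service, so spark.shuffle.service.enabled=true is set alongside it; the executor cap is also doubled from 8 to 16.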
@@ -97,8 +97,9 @@ public class ScholixIndexingTest extends SolrTest {
 			.list(
 				Paths
 					.get(
-						Objects.requireNonNull(getClass().getResource(SolrUtil.CONF_FILE_BASE_PATH)).getPath()))
+						Objects.requireNonNull(getClass().getResource(SolrUtil.CONF_BASE_PATH + "files/")).getPath()))
 			.map(Path::getFileName)
+			.filter(p -> !p.getFileName().toString().equalsIgnoreCase("file_list"))
 			.map(Path::toString)
 			.collect(Collectors.toList());
 		configurationFiles.add("schema.xml");
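The added filter keeps file_list itself out of the expected configuration set: it is the manifest that drives the upload, not a Solr resource to be uploaded.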