From ffa8cdf9810e3688fa8cafa94f0c87a7aff33f33 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Tue, 18 Oct 2022 10:45:40 +0200 Subject: [PATCH] fixed error on loading files on solr, in cluster is not possible to iterate files inside jar --- .../oa/provision/SolrAdminApplication.java | 1 + .../eu/dnetlib/dhp/oa/provision/SolrUtil.java | 37 ++-- .../provision/SparkIndexCollectionOnSOLR.java | 19 +- .../dhp/oa/provision/conf/files/file_list | 6 + .../sx/provision/index_solr_parameters.json | 2 +- .../sx/solr/provision/oozie_app/workflow.xml | 202 +++++++++--------- .../dhp/oa/provision/ScholixIndexingTest.java | 3 +- 7 files changed, 141 insertions(+), 129 deletions(-) create mode 100644 dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/conf/files/file_list diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java index d4b256b66..b3a261127 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java @@ -94,6 +94,7 @@ public class SolrAdminApplication implements Closeable { SolrUtil .uploadZookeperConfig(this.solrClient.getZkStateReader().getZkClient(), collection, true, fields); SolrUtil.createCollection(this.solrClient, collection, 48, 1, 12, collection); + return null; default: throw new IllegalArgumentException("action not managed: " + action); diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrUtil.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrUtil.java index e75d1ac38..6efede998 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrUtil.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrUtil.java @@ -12,6 +12,7 @@ import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.Arrays; import java.util.HashMap; import java.util.Map; import java.util.Objects; @@ -56,9 +57,11 @@ public class SolrUtil { private static final char DELIMITER = '$'; - private static final String CONF_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf"; + public static final String CONF_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/"; - public static final String CONF_FILE_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/files/"; + // public static final String CONF_FILE_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/files/"; + + public static final String LIST_FILE_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/files/file_list"; private static final String SCHEMA_TEMPLATE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/schemaTemplate.xslt"; @@ -155,6 +158,7 @@ public class SolrUtil { } private static String loadFileInClassPath(final String aPath) { + System.out.println("LOAD FILE FROM PATH: " + aPath); try { return IOUtils .toString(Objects.requireNonNull(SolrUtil.class.getResourceAsStream(aPath)), Charset.defaultCharset()); @@ -164,7 +168,7 @@ public class SolrUtil { } public static Map getServiceProperties() throws IOException { - final String properties = loadFileInClassPath(CONF_BASE_PATH + "/service_properties.json"); + final String properties = loadFileInClassPath(CONF_BASE_PATH + "service_properties.json"); final ObjectMapper mapper = new ObjectMapper(); TypeFactory typeFactory = mapper.getTypeFactory(); MapType mapType = typeFactory.constructMapType(HashMap.class, String.class, String.class); @@ -173,7 +177,7 @@ public class SolrUtil { public static String getConfig() throws Exception { final Map p = getServiceProperties(); - final String st = loadFileInClassPath(CONF_BASE_PATH + "/solrconfig.xml.st"); + final String st = loadFileInClassPath(CONF_BASE_PATH + "solrconfig.xml.st"); final ST solrConfig = new ST(st, DELIMITER, DELIMITER); p.forEach(solrConfig::add); return solrConfig.render(); @@ -204,22 +208,15 @@ public class SolrUtil { res.put("solrconfig.xml", getConfig().getBytes()); log.debug("adding solrconfig.xml to the resource map"); - - Files - .list( - Paths.get(Objects.requireNonNull(SolrUtil.class.getResource(CONF_FILE_BASE_PATH)).getPath())) - .map(Path::getFileName) - .forEach(s -> { - log.debug(String.format("put file from path %s", CONF_FILE_BASE_PATH + s)); - res - .put( - String.valueOf(s), - - Objects - .requireNonNull(loadFileInClassPath(CONF_FILE_BASE_PATH + s)) - .getBytes(StandardCharsets.UTF_8)); - }); - + String data = IOUtils + .toString(Objects.requireNonNull(SolrUtil.class.getResourceAsStream(LIST_FILE_BASE_PATH))); + Arrays.stream(data.split("\n")).forEach(s -> { + final String name = s.replace(CONF_BASE_PATH + "files/", ""); + res + .put( + name, + Objects.requireNonNull(loadFileInClassPath(s)).getBytes(StandardCharsets.UTF_8)); + }); return res; } catch (Throwable e) { throw new Exception("failed to build configuration", e); diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/SparkIndexCollectionOnSOLR.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/SparkIndexCollectionOnSOLR.java index 712178719..2eb461e47 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/SparkIndexCollectionOnSOLR.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/SparkIndexCollectionOnSOLR.java @@ -7,6 +7,7 @@ import java.io.IOException; import java.util.Objects; import java.util.Optional; +import org.apache.commons.cli.ParseException; import org.apache.commons.io.IOUtils; import org.apache.solr.common.SolrInputDocument; import org.apache.spark.SparkConf; @@ -21,6 +22,7 @@ import org.slf4j.LoggerFactory; import com.lucidworks.spark.util.SolrSupport; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.provision.ProvisionConstants; import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument; import eu.dnetlib.dhp.oa.provision.scholix.ScholixToSolr; import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient; @@ -33,7 +35,7 @@ public class SparkIndexCollectionOnSOLR { // LOGGER initialized private static final Logger log = LoggerFactory.getLogger(SparkIndexCollectionOnSOLR.class); - public static void main(String[] args) throws IOException { + public static void main(String[] args) throws IOException, ParseException { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils .toString( @@ -42,14 +44,16 @@ public class SparkIndexCollectionOnSOLR { SparkIndexCollectionOnSOLR.class .getResourceAsStream("/eu/dnetlib/dhp/sx/provision/index_solr_parameters.json")))); + parser.parseArgument(args); + final String cluster = parser.get("cluster"); log.info("Cluster is {}", cluster); final String format = parser.get("format"); log.info("Index format name is {}", format); - final String isLookupUrl = parser.get("isLookupUrl"); - log.info("isLookupUrl is {}", isLookupUrl); + final String isLookupUrl = parser.get("isURL"); + log.info("isURL is {}", isLookupUrl); final String inputPath = parser.get("inputPath"); log.info("inputPath: {}", inputPath); @@ -75,11 +79,12 @@ public class SparkIndexCollectionOnSOLR { conf, isSparkSessionManaged, spark -> { - final ISLookupClient isLookupClient = new ISLookupClient( - ISLookupClientFactory.getLookUpService(isLookupUrl)); - final String zkHost = isLookupClient.getZkHost(); + final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl)); + final String zkHost = isLookup.getZkHost(); log.info("zkHost: {}", zkHost); - feedScholixToSOLRIndex(spark, inputPath, format, batchSize, zkHost); + final String collection = ProvisionConstants.getCollectionName(format); + log.info("collection: {}", collection); + feedScholixToSOLRIndex(spark, inputPath, collection, batchSize, zkHost); }); } diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/conf/files/file_list b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/conf/files/file_list new file mode 100644 index 000000000..7b0915088 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/conf/files/file_list @@ -0,0 +1,6 @@ +/eu/dnetlib/dhp/oa/provision/conf/files/currency.xml +/eu/dnetlib/dhp/oa/provision/conf/files/elevate.xml +/eu/dnetlib/dhp/oa/provision/conf/files/params.json +/eu/dnetlib/dhp/oa/provision/conf/files/protwords.txt +/eu/dnetlib/dhp/oa/provision/conf/files/stopwords.txt +/eu/dnetlib/dhp/oa/provision/conf/files/synonyms.txt \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/provision/index_solr_parameters.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/provision/index_solr_parameters.json index 80987de23..c72dc17a4 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/provision/index_solr_parameters.json +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/provision/index_solr_parameters.json @@ -7,7 +7,7 @@ }, { "paramName":"is", - "paramLongName":"isLookupUrl", + "paramLongName":"isURL", "paramDescription":"the Information Service LookUp URL", "paramRequired":true }, diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/solr/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/solr/provision/oozie_app/workflow.xml index 2d46f9f34..0753c35ab 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/solr/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/solr/provision/oozie_app/workflow.xml @@ -1,111 +1,113 @@ - - - - sourcePath - the sourcePath of the json RDDs - - - isLookupUrl - URL for the isLookup service - - - solrDeletionQuery - *:* - query used in the deleted by query operation - - - format - metadata format name (SMF) - + + + + sourcePath + the sourcePath of the json RDDs + + + isLookupUrl + URL for the isLookup service + + + solrDeletionQuery + *:* + query used in the deleted by query operation + + + format + metadata format name (SMF) + - + - + - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + - - - - - oozie.launcher.mapreduce.user.classpath.first - true - - - eu.dnetlib.dhp.oa.provision.SolrAdminApplication - --isLookupUrl${isLookupUrl} - --format${format} - --actionDELETE_BY_QUERY - --query${solrDeletionQuery} - --committrue - - - - + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.oa.provision.SolrAdminApplication + --isLookupUrl${isLookupUrl} + --format${format} + --actionDELETE_BY_QUERY + --query${solrDeletionQuery} + --committrue + + + + - - - - - oozie.launcher.mapreduce.user.classpath.first - true - - - eu.dnetlib.dhp.oa.provision.SolrAdminApplication - --isLookupUrl${isLookupUrl} - --format${format} - --actionCREATE + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.oa.provision.SolrAdminApplication + --isLookupUrl${isLookupUrl} + --format${format} + --actionCREATE - - - - + + + + - - - yarn - cluster - Index summary - eu.dnetlib.dhp.sx.provision.SparkIndexCollectionOnSOLR - dhp-graph-provision-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --conf spark.dynamicAllocation.maxExecutors="8" - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - - --clusteryarn - --isLookupUrl${isLookupUrl} - --inputPath${sourcePath} - --format${format} + + + yarn + cluster + Index summary + eu.dnetlib.dhp.sx.provision.SparkIndexCollectionOnSOLR + dhp-graph-provision-${projectVersion}.jar + + --conf spark.dynamicAllocation.enabled=true + --conf spark.shuffle.service.enabled=true + --executor-memory=${sparkExecutorMemory} + --conf spark.dynamicAllocation.maxExecutors="16" + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --clusteryarn + --isURL${isLookupUrl} + --inputPath${sourcePath} + --format${format} - - - - + + + + - - - - - oozie.launcher.mapreduce.user.classpath.first - true - - - eu.dnetlib.dhp.oa.provision.SolrAdminApplication - --isLookupUrl${isLookupUrl} - --format${format} - --actionCOMMIT - - - - + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.oa.provision.SolrAdminApplication + --isLookupUrl${isLookupUrl} + --format${format} + --actionCOMMIT + + + + - - \ No newline at end of file + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/ScholixIndexingTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/ScholixIndexingTest.java index 45a3642f3..38b1ea3d2 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/ScholixIndexingTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/ScholixIndexingTest.java @@ -97,8 +97,9 @@ public class ScholixIndexingTest extends SolrTest { .list( Paths .get( - Objects.requireNonNull(getClass().getResource(SolrUtil.CONF_FILE_BASE_PATH)).getPath())) + Objects.requireNonNull(getClass().getResource(SolrUtil.CONF_BASE_PATH + "files/")).getPath())) .map(Path::getFileName) + .filter(p -> !p.getFileName().toString().equalsIgnoreCase("file_list")) .map(Path::toString) .collect(Collectors.toList()); configurationFiles.add("schema.xml");