whitelist of topics

Michele Artini 3 years ago
parent 38da1c282a
commit 399548f221

@ -59,6 +59,9 @@ public class GenerateEventsJob {
final Set<String> dsIdBlacklist = ClusterUtils.parseParamAsList(parser, "datasourceIdBlacklist");
log.info("datasourceIdBlacklist: {}", StringUtils.join(dsIdBlacklist, ","));
final Set<String> topicWhitelist = ClusterUtils.parseParamAsList(parser, "topicWhitelist");
log.info("topicWhitelist: {}", StringUtils.join(topicWhitelist, ","));
final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
@ -75,7 +78,7 @@ public class GenerateEventsJob {
final Dataset<Event> dataset = groups
g -> EventFinder
.generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, accumulators),
.generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, topicWhitelist, accumulators),
.flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class));

@ -76,6 +76,7 @@ public class EventFinder {
final Set<String> dsIdWhitelist,
final Set<String> dsIdBlacklist,
final Set<String> dsTypeWhitelist,
final Set<String> topicWhitelist,
final Map<String, LongAccumulator> accumulators) {
final List<UpdateInfo<?>> list = new ArrayList<>();
@ -84,7 +85,13 @@ public class EventFinder {
for (final OaBrokerRelatedDatasource targetDs : target.getDatasources()) {
if (verifyTarget(targetDs, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist)) {
for (final UpdateMatcher<?> matcher : matchers) {
list.addAll(matcher.searchUpdatesForRecord(target, targetDs, results.getData(), accumulators));
for (final UpdateInfo<?> info : matcher
.searchUpdatesForRecord(target, targetDs, results.getData(), accumulators)) {
if (topicWhitelist == null || topicWhitelist.isEmpty()
|| topicWhitelist.contains(info.getTopic().getPath())) {

@ -24,6 +24,11 @@
<description>a black list (comma separated, - for empty list) of datasource ids</description>
<description>a white list (comma separated, * for all) of topics</description>
<description>the elasticsearch index name for events</description>
@ -447,6 +452,7 @@
<ok to="index_event_subset"/>
<error to="Kill"/>

@ -28,5 +28,11 @@
"paramLongName": "datasourceIdBlacklist",
"paramDescription": "a black list (comma separeted, - for empty list) of datasource ids",
"paramRequired": true
"paramName": "topicWhitelist",
"paramLongName": "topicWhitelist",
"paramDescription": "a white list (comma separeted, * for all) of topics",
"paramRequired": true

@ -0,0 +1,18 @@

@ -0,0 +1,116 @@
<workflow-app name="reindex_events" xmlns="uri:oozie:workflow:0.5">
<description>the path where the generated data will be stored</description>
<description>the elasticsearch index name for events</description>
<description>the elasticsearch host</description>
<description>the max number of events for each couple (ds/topic)</description>
<description>the url of the broker service api</description>
<description>memory for driver process</description>
<description>memory for individual executor</description>
<description>number of cores used by single executor</description>
<description>oozie action sharelib for spark 2.*</description>
<description>spark 2.* extra listeners classname</description>
<description>spark 2.* sql query execution listeners classname</description>
<description>spark 2.* yarn history server address</description>
<description>spark 2.* event log dir location</description>
<start to="index_event_subset"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
<action name="index_event_subset">
<spark xmlns="uri:oozie:spark-action:0.2">
--conf spark.dynamicAllocation.maxExecutors="8"
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
<ok to="End"/>
<error to="Kill"/>
<end name="End"/>