forked from D-Net/dnet-hadoop
whitelist of topics
This commit is contained in:
parent
467aa77279
commit
3c9cbd19f3
|
@ -59,6 +59,9 @@ public class GenerateEventsJob {
|
||||||
final Set<String> dsIdBlacklist = ClusterUtils.parseParamAsList(parser, "datasourceIdBlacklist");
|
final Set<String> dsIdBlacklist = ClusterUtils.parseParamAsList(parser, "datasourceIdBlacklist");
|
||||||
log.info("datasourceIdBlacklist: {}", StringUtils.join(dsIdBlacklist, ","));
|
log.info("datasourceIdBlacklist: {}", StringUtils.join(dsIdBlacklist, ","));
|
||||||
|
|
||||||
|
final Set<String> topicWhitelist = ClusterUtils.parseParamAsList(parser, "topicWhitelist");
|
||||||
|
log.info("topicWhitelist: {}", StringUtils.join(topicWhitelist, ","));
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
|
||||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||||
|
@ -75,7 +78,7 @@ public class GenerateEventsJob {
|
||||||
final Dataset<Event> dataset = groups
|
final Dataset<Event> dataset = groups
|
||||||
.map(
|
.map(
|
||||||
g -> EventFinder
|
g -> EventFinder
|
||||||
.generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, accumulators),
|
.generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, topicWhitelist, accumulators),
|
||||||
Encoders
|
Encoders
|
||||||
.bean(EventGroup.class))
|
.bean(EventGroup.class))
|
||||||
.flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class));
|
.flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class));
|
||||||
|
|
|
@ -76,6 +76,7 @@ public class EventFinder {
|
||||||
final Set<String> dsIdWhitelist,
|
final Set<String> dsIdWhitelist,
|
||||||
final Set<String> dsIdBlacklist,
|
final Set<String> dsIdBlacklist,
|
||||||
final Set<String> dsTypeWhitelist,
|
final Set<String> dsTypeWhitelist,
|
||||||
|
final Set<String> topicWhitelist,
|
||||||
final Map<String, LongAccumulator> accumulators) {
|
final Map<String, LongAccumulator> accumulators) {
|
||||||
|
|
||||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||||
|
@ -84,7 +85,13 @@ public class EventFinder {
|
||||||
for (final OaBrokerRelatedDatasource targetDs : target.getDatasources()) {
|
for (final OaBrokerRelatedDatasource targetDs : target.getDatasources()) {
|
||||||
if (verifyTarget(targetDs, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist)) {
|
if (verifyTarget(targetDs, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist)) {
|
||||||
for (final UpdateMatcher<?> matcher : matchers) {
|
for (final UpdateMatcher<?> matcher : matchers) {
|
||||||
list.addAll(matcher.searchUpdatesForRecord(target, targetDs, results.getData(), accumulators));
|
for (final UpdateInfo<?> info : matcher
|
||||||
|
.searchUpdatesForRecord(target, targetDs, results.getData(), accumulators)) {
|
||||||
|
if (topicWhitelist == null || topicWhitelist.isEmpty()
|
||||||
|
|| topicWhitelist.contains(info.getTopic().getPath())) {
|
||||||
|
list.add(info);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -24,6 +24,11 @@
|
||||||
<value>-</value>
|
<value>-</value>
|
||||||
<description>a black list (comma separated, - for empty list) of datasource ids</description>
|
<description>a black list (comma separated, - for empty list) of datasource ids</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>topicWhitelist</name>
|
||||||
|
<value>*</value>
|
||||||
|
<description>a white list (comma separated, * for all) of topics</description>
|
||||||
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>esEventIndexName</name>
|
<name>esEventIndexName</name>
|
||||||
<description>the elasticsearch index name for events</description>
|
<description>the elasticsearch index name for events</description>
|
||||||
|
@ -447,6 +452,7 @@
|
||||||
<arg>--datasourceIdWhitelist</arg><arg>${datasourceIdWhitelist}</arg>
|
<arg>--datasourceIdWhitelist</arg><arg>${datasourceIdWhitelist}</arg>
|
||||||
<arg>--datasourceTypeWhitelist</arg><arg>${datasourceTypeWhitelist}</arg>
|
<arg>--datasourceTypeWhitelist</arg><arg>${datasourceTypeWhitelist}</arg>
|
||||||
<arg>--datasourceIdBlacklist</arg><arg>${datasourceIdBlacklist}</arg>
|
<arg>--datasourceIdBlacklist</arg><arg>${datasourceIdBlacklist}</arg>
|
||||||
|
<arg>--topicWhitelist</arg><arg>${topicWhitelist}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="index_event_subset"/>
|
<ok to="index_event_subset"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
|
|
@ -28,5 +28,11 @@
|
||||||
"paramLongName": "datasourceIdBlacklist",
|
"paramLongName": "datasourceIdBlacklist",
|
||||||
"paramDescription": "a black list (comma separated, - for empty list) of datasource ids",
|
"paramDescription": "a black list (comma separated, - for empty list) of datasource ids",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "topicWhitelist",
|
||||||
|
"paramLongName": "topicWhitelist",
|
||||||
|
"paramDescription": "a white list (comma separated, * for all) of topics",
|
||||||
|
"paramRequired": true
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
|
@ -0,0 +1,18 @@
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>jobTracker</name>
|
||||||
|
<value>yarnRM</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>nameNode</name>
|
||||||
|
<value>hdfs://nameservice1</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.use.system.libpath</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>spark2</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
|
@ -0,0 +1,116 @@
|
||||||
|
<workflow-app name="reindex_events" xmlns="uri:oozie:workflow:0.5">
|
||||||
|
|
||||||
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>outputDir</name>
|
||||||
|
<description>the path where the generated data will be stored</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>esEventIndexName</name>
|
||||||
|
<description>the elasticsearch index name for events</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>esIndexHost</name>
|
||||||
|
<description>the elasticsearch host</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>maxIndexedEventsForDsAndTopic</name>
|
||||||
|
<description>the max number of events for each couple (ds/topic)</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>brokerApiBaseUrl</name>
|
||||||
|
<description>the url of the broker service api</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkDriverMemory</name>
|
||||||
|
<description>memory for driver process</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorMemory</name>
|
||||||
|
<description>memory for individual executor</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorCores</name>
|
||||||
|
<description>number of cores used by single executor</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozieActionShareLibForSpark2</name>
|
||||||
|
<description>oozie action sharelib for spark 2.*</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2ExtraListeners</name>
|
||||||
|
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||||
|
<description>spark 2.* extra listeners classname</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2SqlQueryExecutionListeners</name>
|
||||||
|
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||||
|
<description>spark 2.* sql query execution listeners classname</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2YarnHistoryServerAddress</name>
|
||||||
|
<description>spark 2.* yarn history server address</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2EventLogDir</name>
|
||||||
|
<description>spark 2.* event log dir location</description>
|
||||||
|
</property>
|
||||||
|
</parameters>
|
||||||
|
|
||||||
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>mapreduce.job.queuename</name>
|
||||||
|
<value>${queueName}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||||
|
<value>${oozieLauncherQueueName}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
|
<start to="index_event_subset"/>
|
||||||
|
|
||||||
|
<kill name="Kill">
|
||||||
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<action name="index_event_subset">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>IndexEventSubsetOnESJob</name>
|
||||||
|
<class>eu.dnetlib.dhp.broker.oa.IndexEventSubsetJob</class>
|
||||||
|
<jar>dhp-broker-events-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.dynamicAllocation.maxExecutors="8"
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||||
|
<arg>--index</arg><arg>${esEventIndexName}</arg>
|
||||||
|
<arg>--esHost</arg><arg>${esIndexHost}</arg>
|
||||||
|
<arg>--maxEventsForTopic</arg><arg>${maxIndexedEventsForDsAndTopic}</arg>
|
||||||
|
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="End"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<end name="End"/>
|
||||||
|
|
||||||
|
</workflow-app>
|
Loading…
Reference in New Issue