From 335af1137b82f7a37ffa4e7a5ad597ea7b731091 Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Tue, 17 Apr 2018 15:06:04 +0200 Subject: [PATCH 01/16] Kafka 1.1.0 for brokers, zoo not yet bumped --- kafka/50kafka.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kafka/50kafka.yml b/kafka/50kafka.yml index 99b05eca..4315264a 100644 --- a/kafka/50kafka.yml +++ b/kafka/50kafka.yml @@ -42,7 +42,7 @@ spec: mountPath: /etc/kafka containers: - name: broker - image: solsson/kafka:1.0.1@sha256:1a4689d49d6274ac59b9b740f51b0408e1c90a9b66d16ad114ee9f7193bab111 + image: solsson/kafka:1.1@sha256:ba863ca7dc28563930584e37f93d57c2cbf3f46b1c1fa104fe8af7bcc0c31df4 env: - name: KAFKA_LOG4J_OPTS value: -Dlog4j.configuration=file:/etc/kafka/log4j.properties From d973cb1a4ca8039d56b270a9309ce1693080017f Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Tue, 1 May 2018 11:37:36 +0200 Subject: [PATCH 02/16] Runs Zookeeper too from the 1.1.0 Kafka image --- zookeeper/50pzoo.yml | 2 +- zookeeper/51zoo.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/zookeeper/50pzoo.yml b/zookeeper/50pzoo.yml index 65fd344d..ca6debc6 100644 --- a/zookeeper/50pzoo.yml +++ b/zookeeper/50pzoo.yml @@ -33,7 +33,7 @@ spec: mountPath: /var/lib/zookeeper/data containers: - name: zookeeper - image: solsson/kafka:1.0.1@sha256:1a4689d49d6274ac59b9b740f51b0408e1c90a9b66d16ad114ee9f7193bab111 + image: solsson/kafka:1.1@sha256:ba863ca7dc28563930584e37f93d57c2cbf3f46b1c1fa104fe8af7bcc0c31df4 env: - name: KAFKA_LOG4J_OPTS value: -Dlog4j.configuration=file:/etc/kafka/log4j.properties diff --git a/zookeeper/51zoo.yml b/zookeeper/51zoo.yml index d87ab0de..6868a88d 100644 --- a/zookeeper/51zoo.yml +++ b/zookeeper/51zoo.yml @@ -36,7 +36,7 @@ spec: mountPath: /var/lib/zookeeper/data containers: - name: zookeeper - image: solsson/kafka:1.0.1@sha256:1a4689d49d6274ac59b9b740f51b0408e1c90a9b66d16ad114ee9f7193bab111 + image: 
solsson/kafka:1.1@sha256:ba863ca7dc28563930584e37f93d57c2cbf3f46b1c1fa104fe8af7bcc0c31df4 env: - name: KAFKA_LOG4J_OPTS value: -Dlog4j.configuration=file:/etc/kafka/log4j.properties From c583a49ff0f7ed9e82e2aafe5674a4555b2716f3 Mon Sep 17 00:00:00 2001 From: Patrick Double Date: Wed, 31 Oct 2018 15:55:23 -0500 Subject: [PATCH 03/16] initial cruise-control --- ...0broker-cruise-control-reporter-config.yml | 12 + cruise-control/11cruise-control-config.yml | 311 ++++++++++++++++++ .../20kafka-broker-reporter-patch.yml | 38 +++ cruise-control/40cruise-control-service.yml | 12 + cruise-control/50cruise-control.yml | 51 +++ 5 files changed, 424 insertions(+) create mode 100644 cruise-control/10broker-cruise-control-reporter-config.yml create mode 100644 cruise-control/11cruise-control-config.yml create mode 100644 cruise-control/20kafka-broker-reporter-patch.yml create mode 100644 cruise-control/40cruise-control-service.yml create mode 100644 cruise-control/50cruise-control.yml diff --git a/cruise-control/10broker-cruise-control-reporter-config.yml b/cruise-control/10broker-cruise-control-reporter-config.yml new file mode 100644 index 00000000..9b6f190b --- /dev/null +++ b/cruise-control/10broker-cruise-control-reporter-config.yml @@ -0,0 +1,12 @@ +kind: ConfigMap +metadata: + name: broker-cruise-control-reporter-config + namespace: kafka +apiVersion: v1 +data: + cruise-control-reporter-init.sh: |- + #!/bin/bash + set -xe + VERSION=2.0.6 + curl -L -o /opt/kafka/libs/extensions/cruise-control-metrics-reporter.jar https://linkedin.jfrog.io/linkedin/cruise-control/com/linkedin/cruisecontrol/cruise-control-metrics-reporter/${VERSION}/cruise-control-metrics-reporter-${VERSION}.jar + echo -e "\n\nmetric.reporters = com.linkedin.kafka.cruisecontrol.metricsreporter.CruiseControlMetricsReporter" >> /etc/kafka/server.properties diff --git a/cruise-control/11cruise-control-config.yml b/cruise-control/11cruise-control-config.yml new file mode 100644 index 00000000..c6b76fe7 --- 
/dev/null +++ b/cruise-control/11cruise-control-config.yml @@ -0,0 +1,311 @@ +kind: ConfigMap +metadata: + name: broker-cruise-control-config + namespace: kafka +apiVersion: v1 +data: + cruisecontrol.properties: |- + # + # Copyright 2017 LinkedIn Corp. Licensed under the BSD 2-Clause License (the "License"). See License in the project root for license information. + # + + # This is an example property file for Kafka Cruise Control. See KafkaCruiseControlConfig for more details. + + # Configuration for the metadata client. + # ======================================= + + # The Kafka cluster to control. + bootstrap.servers=bootstrap:9092 + + # The maximum interval in milliseconds between two metadata refreshes. + #metadata.max.age.ms=300000 + + # Client id for the Cruise Control. It is used for the metadata client. + #client.id=kafka-cruise-control + + # The size of TCP send buffer bytes for the metadata client. + #send.buffer.bytes=131072 + + # The size of TCP receive buffer size for the metadata client. + #receive.buffer.bytes=131072 + + # The time to wait before disconnect an idle TCP connection. + #connections.max.idle.ms=540000 + + # The time to wait before reconnect to a given host. + #reconnect.backoff.ms=50 + + # The time to wait for a response from a host after sending a request. 
+ #request.timeout.ms=30000 + + + # Configurations for the load monitor + # ======================================= + + # The number of metric fetcher thread to fetch metrics for the Kafka cluster + num.metric.fetchers=1 + + # The metric sampler class + metric.sampler.class=com.linkedin.kafka.cruisecontrol.monitor.sampling.CruiseControlMetricsReporterSampler + # Configurations for CruiseControlMetricsReporterSampler + metric.reporter.topic.pattern=__CruiseControlMetrics + + # The sample store class name + sample.store.class=com.linkedin.kafka.cruisecontrol.monitor.sampling.KafkaSampleStore + + # The config for the Kafka sample store to save the partition metric samples + partition.metric.sample.store.topic=__KafkaCruiseControlPartitionMetricSamples + + # The config for the Kafka sample store to save the model training samples + broker.metric.sample.store.topic=__KafkaCruiseControlModelTrainingSamples + + # The replication factor of Kafka metric sample store topic + sample.store.topic.replication.factor=2 + + # The config for the number of Kafka sample store consumer threads + num.sample.loading.threads=8 + + # The partition assignor class for the metric samplers + metric.sampler.partition.assignor.class=com.linkedin.kafka.cruisecontrol.monitor.sampling.DefaultMetricSamplerPartitionAssignor + + # The metric sampling interval in milliseconds + metric.sampling.interval.ms=120000 + + # The partition metrics window size in milliseconds + partition.metrics.window.ms=300000 + + # The number of partition metric windows to keep in memory + num.partition.metrics.windows=1 + + # The minimum partition metric samples required for a partition in each window + min.samples.per.partition.metrics.window=1 + + # The broker metrics window size in milliseconds + broker.metrics.window.ms=300000 + + # The number of broker metric windows to keep in memory + num.broker.metrics.windows=20 + + # The minimum broker metric samples required for a partition in each window + 
min.samples.per.broker.metrics.window=1 + + # The configuration for the BrokerCapacityConfigFileResolver (supports JBOD and non-JBOD broker capacities) + capacity.config.file=config/capacity.json + #capacity.config.file=config/capacityJBOD.json + + # Configurations for the analyzer + # ======================================= + + # The list of goals to optimize the Kafka cluster for with pre-computed proposals + default.goals=com.linkedin.kafka.cruisecontrol.analyzer.goals.RackAwareGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.PotentialNwOutGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.TopicReplicaDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.LeaderBytesInDistributionGoal + + # The list of supported goals + 
goals=com.linkedin.kafka.cruisecontrol.analyzer.goals.RackAwareGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.PotentialNwOutGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.TopicReplicaDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.LeaderBytesInDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.kafkaassigner.KafkaAssignerDiskUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.kafkaassigner.KafkaAssignerEvenRackAwareGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.PreferredLeaderElectionGoal + + # The list of supported hard goals + hard.goals=com.linkedin.kafka.cruisecontrol.analyzer.goals.RackAwareGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuCapacityGoal + + # The minimum percentage of well monitored partitions out of all the partitions + min.monitored.partition.percentage=0.95 + + # The balance threshold for CPU + cpu.balance.threshold=1.1 + + # The balance threshold for disk + 
disk.balance.threshold=1.1 + + # The balance threshold for network inbound utilization + network.inbound.balance.threshold=1.1 + + # The balance threshold for network outbound utilization + network.outbound.balance.threshold=1.1 + + # The balance threshold for the replica count + replica.count.balance.threshold=1.1 + + # The capacity threshold for CPU in percentage + cpu.capacity.threshold=0.8 + + # The capacity threshold for disk in percentage + disk.capacity.threshold=0.8 + + # The capacity threshold for network inbound utilization in percentage + network.inbound.capacity.threshold=0.8 + + # The capacity threshold for network outbound utilization in percentage + network.outbound.capacity.threshold=0.8 + + # The threshold to define the cluster to be in a low CPU utilization state + cpu.low.utilization.threshold=0.0 + + # The threshold to define the cluster to be in a low disk utilization state + disk.low.utilization.threshold=0.0 + + # The threshold to define the cluster to be in a low network inbound utilization state + network.inbound.low.utilization.threshold=0.0 + + # The threshold to define the cluster to be in a low network outbound utilization state + network.outbound.low.utilization.threshold=0.0 + + # The metric anomaly percentile upper threshold + metric.anomaly.percentile.upper.threshold=90.0 + + # The metric anomaly percentile lower threshold + metric.anomaly.percentile.lower.threshold=10.0 + + # How often should the cached proposal be expired and recalculated if necessary + proposal.expiration.ms=60000 + + # The maximum number of replicas that can reside on a broker at any given time. + max.replicas.per.broker=10000 + + # The number of threads to use for proposal candidate precomputing. + num.proposal.precompute.threads=1 + + # the topics that should be excluded from the partition movement. 
+ #topics.excluded.from.partition.movement + + # Configurations for the executor + # ======================================= + + # The zookeeper connect of the Kafka cluster + zookeeper.connect=zookeeper:2181 + + # The max number of partitions to move in/out on a given broker at a given time. + num.concurrent.partition.movements.per.broker=10 + + # The interval between two execution progress checks. + execution.progress.check.interval.ms=10000 + + + # Configurations for anomaly detector + # ======================================= + + # The goal violation notifier class + anomaly.notifier.class=com.linkedin.kafka.cruisecontrol.detector.notifier.SelfHealingNotifier + + # The metric anomaly finder class + metric.anomaly.finder.class=com.linkedin.kafka.cruisecontrol.detector.KafkaMetricAnomalyFinder + + # The anomaly detection interval + anomaly.detection.interval.ms=10000 + + # The goal violation to detect. + anomaly.detection.goals=com.linkedin.kafka.cruisecontrol.analyzer.goals.RackAwareGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuCapacityGoal + + # The interested metrics for metric anomaly analyzer. + metric.anomaly.analyzer.metrics=BROKER_PRODUCE_LOCAL_TIME_MS_MAX,BROKER_PRODUCE_LOCAL_TIME_MS_MEAN,BROKER_CONSUMER_FETCH_LOCAL_TIME_MS_MAX,BROKER_CONSUMER_FETCH_LOCAL_TIME_MS_MEAN,BROKER_FOLLOWER_FETCH_LOCAL_TIME_MS_MAX,BROKER_FOLLOWER_FETCH_LOCAL_TIME_MS_MEAN,BROKER_LOG_FLUSH_TIME_MS_MAX,BROKER_LOG_FLUSH_TIME_MS_MEAN + + # The zk path to store failed broker information. 
+ failed.brokers.zk.path=/CruiseControlBrokerList + + # Topic config provider class + topic.config.provider.class=com.linkedin.kafka.cruisecontrol.config.KafkaTopicConfigProvider + + # The cluster configurations for the KafkaTopicConfigProvider + cluster.configs.file=config/clusterConfigs.json + + # Enable self healing for all anomaly detectors, unless the particular anomaly detector is explicitly disabled + self.healing.enabled=true + + # Enable self healing for broker failure detector + #self.healing.broker.failure.enabled=true + + # Enable self healing for goal violation detector + #self.healing.goal.violation.enabled=true + + # Enable self healing for metric anomaly detector + #self.healing.metric.anomaly.enabled=true + + capacityJBOD.json: |- + { + "brokerCapacities":[ + { + "brokerId": "-1", + "capacity": { + "DISK": {"/tmp/kafka-logs-1": "100000", "/tmp/kafka-logs-2": "100000", "/tmp/kafka-logs-3": "50000", + "/tmp/kafka-logs-4": "50000", "/tmp/kafka-logs-5": "150000", "/tmp/kafka-logs-6": "50000"}, + "CPU": "100", + "NW_IN": "10000", + "NW_OUT": "10000" + }, + "doc": "The default capacity for a broker with multiple logDirs each on a separate heterogeneous disk." + }, + { + "brokerId": "0", + "capacity": { + "DISK": {"/tmp/kafka-logs": "500000"}, + "CPU": "100", + "NW_IN": "50000", + "NW_OUT": "50000" + }, + "doc": "This overrides the capacity for broker 0. This broker is not a JBOD broker." + }, + { + "brokerId": "1", + "capacity": { + "DISK": {"/tmp/kafka-logs-1": "250000", "/tmp/kafka-logs-2": "250000"}, + "CPU": "100", + "NW_IN": "50000", + "NW_OUT": "50000" + }, + "doc": "This overrides the capacity for broker 1. This broker is a JBOD broker." + } + ] + } + + capacity.json: |- + { + "brokerCapacities":[ + { + "brokerId": "-1", + "capacity": { + "DISK": "100000", + "CPU": "100", + "NW_IN": "10000", + "NW_OUT": "10000" + }, + "doc": "This is the default capacity. Capacity unit used for disk is in MB, cpu is in percentage, network throughput is in KB." 
+ }, + { + "brokerId": "0", + "capacity": { + "DISK": "500000", + "CPU": "100", + "NW_IN": "50000", + "NW_OUT": "50000" + }, + "doc": "This overrides the capacity for broker 0." + } + ] + } + + clusterConfigs.json: |- + { + "min.insync.replicas": 1, + "an.example.cluster.config": false + } + + log4j2.xml: |- + + + + + + + + + + + + + + + log4j.properties: |- + log4j.rootLogger = INFO, FILE + + log4j.appender.FILE=org.apache.log4j.FileAppender + log4j.appender.FILE.File=/dev/stdout + + log4j.appender.FILE.layout=org.apache.log4j.PatternLayout + log4j.appender.FILE.layout.conversionPattern=%-6r [%15.15t] %-5p %30.30c %x - %m%n diff --git a/cruise-control/20kafka-broker-reporter-patch.yml b/cruise-control/20kafka-broker-reporter-patch.yml new file mode 100644 index 00000000..1f2ec935 --- /dev/null +++ b/cruise-control/20kafka-broker-reporter-patch.yml @@ -0,0 +1,38 @@ +# meant to be applied using +# kubectl --namespace kafka patch statefulset kafka --patch "$(cat cruise-control/20kafka-broker-reporter-patch.yml )" +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: kafka + namespace: kafka +spec: + template: + spec: + initContainers: + - name: cruise-control-reporter + image: hortonworks/alpine-curl:3.1 + command: ['/bin/sh', '/tmp/cruise-control-reporter-configmap/cruise-control-reporter-init.sh'] + volumeMounts: + - name: cruiseconfigmap + mountPath: /tmp/cruise-control-reporter-configmap + - name: config + mountPath: /etc/kafka + - name: extensions + mountPath: /opt/kafka/libs/extensions + $setElementOrder/initContainers: + - name: init-config + - name: cruise-control-reporter + containers: + - name: broker + env: + - name: CLASSPATH + value: /opt/kafka/libs/extensions/cruise-control-metrics-reporter.jar + volumeMounts: + - name: extensions + mountPath: /opt/kafka/libs/extensions + volumes: + - name: cruiseconfigmap + configMap: + name: broker-cruise-control-reporter-config + - name: extensions + emptyDir: {} diff --git 
a/cruise-control/40cruise-control-service.yml b/cruise-control/40cruise-control-service.yml new file mode 100644 index 00000000..dcb8f243 --- /dev/null +++ b/cruise-control/40cruise-control-service.yml @@ -0,0 +1,12 @@ +kind: Service +apiVersion: v1 +metadata: + name: cruise-control + namespace: kafka +spec: + selector: + app: cruise-control + ports: + - protocol: TCP + port: 8090 + targetPort: 8090 diff --git a/cruise-control/50cruise-control.yml b/cruise-control/50cruise-control.yml new file mode 100644 index 00000000..2b12b98f --- /dev/null +++ b/cruise-control/50cruise-control.yml @@ -0,0 +1,51 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cruise-control + namespace: kafka +spec: + selector: + matchLabels: + app: cruise-control + replicas: 1 + template: + metadata: + labels: + app: cruise-control + annotations: + spec: + terminationGracePeriodSeconds: 30 + initContainers: + - name: init-config + image: alpine:3.8 + command: ['/bin/sh'] + args: [ '-c', 'cp /etc/cruise-control-configmap/* /opt/cruise-control/config'] + volumeMounts: + - name: configmap + mountPath: /etc/cruise-control-configmap + - name: config + mountPath: /opt/cruise-control/config + containers: + - name: cruise-control + image: pdouble16/kafka-cruise-control:2.0.6 + imagePullPolicy: IfNotPresent + ports: + - name: api + containerPort: 8090 + resources: + requests: + cpu: 100m + memory: 512Mi + readinessProbe: + tcpSocket: + port: 8090 + timeoutSeconds: 1 + volumeMounts: + - name: config + mountPath: /opt/cruise-control/config + volumes: + - name: configmap + configMap: + name: broker-cruise-control-config + - name: config + emptyDir: {} From 28e392bb5797aec9b155438f46193a8305e8d651 Mon Sep 17 00:00:00 2001 From: Patrick Double Date: Tue, 18 Dec 2018 10:26:25 -0600 Subject: [PATCH 04/16] Cruise control 2.0.17 --- cruise-control/50cruise-control.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cruise-control/50cruise-control.yml 
b/cruise-control/50cruise-control.yml index 2b12b98f..93535b42 100644 --- a/cruise-control/50cruise-control.yml +++ b/cruise-control/50cruise-control.yml @@ -27,7 +27,7 @@ spec: mountPath: /opt/cruise-control/config containers: - name: cruise-control - image: pdouble16/kafka-cruise-control:2.0.6 + image: pdouble16/kafka-cruise-control-docker:2.0.17 imagePullPolicy: IfNotPresent ports: - name: api From ff34f96dfd6de538867c81f6f9f4f4713bc081be Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Wed, 19 Dec 2018 21:46:31 +0100 Subject: [PATCH 05/16] Prepares for merge to master --- kafka/50kafka.yml | 2 +- zookeeper/50pzoo.yml | 2 +- zookeeper/51zoo.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kafka/50kafka.yml b/kafka/50kafka.yml index 4315264a..378ce85c 100644 --- a/kafka/50kafka.yml +++ b/kafka/50kafka.yml @@ -42,7 +42,7 @@ spec: mountPath: /etc/kafka containers: - name: broker - image: solsson/kafka:1.1@sha256:ba863ca7dc28563930584e37f93d57c2cbf3f46b1c1fa104fe8af7bcc0c31df4 + image: solsson/kafka:2.1.0@sha256:ac3f06d87d45c7be727863f31e79fbfdcb9c610b51ba9cf03c75a95d602f15e1 env: - name: KAFKA_LOG4J_OPTS value: -Dlog4j.configuration=file:/etc/kafka/log4j.properties diff --git a/zookeeper/50pzoo.yml b/zookeeper/50pzoo.yml index ca6debc6..69de9832 100644 --- a/zookeeper/50pzoo.yml +++ b/zookeeper/50pzoo.yml @@ -33,7 +33,7 @@ spec: mountPath: /var/lib/zookeeper/data containers: - name: zookeeper - image: solsson/kafka:1.1@sha256:ba863ca7dc28563930584e37f93d57c2cbf3f46b1c1fa104fe8af7bcc0c31df4 + image: solsson/kafka:2.1.0@sha256:ac3f06d87d45c7be727863f31e79fbfdcb9c610b51ba9cf03c75a95d602f15e1 env: - name: KAFKA_LOG4J_OPTS value: -Dlog4j.configuration=file:/etc/kafka/log4j.properties diff --git a/zookeeper/51zoo.yml b/zookeeper/51zoo.yml index 6868a88d..95b5f23a 100644 --- a/zookeeper/51zoo.yml +++ b/zookeeper/51zoo.yml @@ -36,7 +36,7 @@ spec: mountPath: /var/lib/zookeeper/data containers: - name: zookeeper - image: 
solsson/kafka:1.1@sha256:ba863ca7dc28563930584e37f93d57c2cbf3f46b1c1fa104fe8af7bcc0c31df4 + image: solsson/kafka:2.1.0@sha256:ac3f06d87d45c7be727863f31e79fbfdcb9c610b51ba9cf03c75a95d602f15e1 env: - name: KAFKA_LOG4J_OPTS value: -Dlog4j.configuration=file:/etc/kafka/log4j.properties From e7fc38f0105878bfd57e36324232cd6f40186bfa Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Thu, 20 Dec 2018 08:23:53 +0100 Subject: [PATCH 06/16] Creates the topic that Cruise Control needs at start --- cruise-control/topic-create.yml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 cruise-control/topic-create.yml diff --git a/cruise-control/topic-create.yml b/cruise-control/topic-create.yml new file mode 100644 index 00000000..48fae0fd --- /dev/null +++ b/cruise-control/topic-create.yml @@ -0,0 +1,28 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: topic-cruise-control-metrics + namespace: kafka +spec: + template: + spec: + containers: + - name: topic-create + image: solsson/kafka:2.1.0@sha256:ac3f06d87d45c7be727863f31e79fbfdcb9c610b51ba9cf03c75a95d602f15e1 + command: + - ./bin/kafka-topics.sh + - --zookeeper + - zookeeper.kafka.svc.cluster.local:2181 + - --create + - --if-not-exists + - --topic + - __CruiseControlMetrics + - --partitions + - '12' + - --replication-factor + - '3' + resources: + limits: + cpu: 200m + memory: 100Mi + restartPolicy: Never From 3003ef4655428006aef8a2fb1c28620b28376557 Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Thu, 20 Dec 2018 21:13:54 +0100 Subject: [PATCH 07/16] Current config from the migrate_to_kafka_2_0 branch but with the changes from the original PR preserved, except maybe self healing --- cruise-control/11cruise-control-config.yml | 74 +++++++++++++++++++++- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/cruise-control/11cruise-control-config.yml b/cruise-control/11cruise-control-config.yml index c6b76fe7..da6748bb 100644 --- a/cruise-control/11cruise-control-config.yml +++ 
b/cruise-control/11cruise-control-config.yml @@ -169,7 +169,7 @@ data: # ======================================= # The zookeeper connect of the Kafka cluster - zookeeper.connect=zookeeper:2181 + zookeeper.connect=zookeeper:2181/ # The max number of partitions to move in/out on a given broker at a given time. num.concurrent.partition.movements.per.broker=10 @@ -196,6 +196,9 @@ data: # The interested metrics for metric anomaly analyzer. metric.anomaly.analyzer.metrics=BROKER_PRODUCE_LOCAL_TIME_MS_MAX,BROKER_PRODUCE_LOCAL_TIME_MS_MEAN,BROKER_CONSUMER_FETCH_LOCAL_TIME_MS_MAX,BROKER_CONSUMER_FETCH_LOCAL_TIME_MS_MEAN,BROKER_FOLLOWER_FETCH_LOCAL_TIME_MS_MAX,BROKER_FOLLOWER_FETCH_LOCAL_TIME_MS_MEAN,BROKER_LOG_FLUSH_TIME_MS_MAX,BROKER_LOG_FLUSH_TIME_MS_MEAN + ## Adjust accordingly if your metrics reporter is an older version and does not produce these metrics. + #metric.anomaly.analyzer.metrics=BROKER_PRODUCE_LOCAL_TIME_MS_50TH,BROKER_PRODUCE_LOCAL_TIME_MS_999TH,BROKER_CONSUMER_FETCH_LOCAL_TIME_MS_50TH,BROKER_CONSUMER_FETCH_LOCAL_TIME_MS_999TH,BROKER_FOLLOWER_FETCH_LOCAL_TIME_MS_50TH,BROKER_FOLLOWER_FETCH_LOCAL_TIME_MS_999TH,BROKER_LOG_FLUSH_TIME_MS_50TH,BROKER_LOG_FLUSH_TIME_MS_999TH + # The zk path to store failed broker information. failed.brokers.zk.path=/CruiseControlBrokerList @@ -205,8 +208,20 @@ data: # The cluster configurations for the KafkaTopicConfigProvider cluster.configs.file=config/clusterConfigs.json + # The maximum time in milliseconds to store the response and access details of a completed user task. + completed.user.task.retention.time.ms=21600000 + + # The maximum time in milliseconds to retain the demotion history of brokers. + demotion.history.retention.time.ms=86400000 + + # The maximum number of completed user tasks for which the response and access details will be cached. + max.cached.completed.user.tasks=100 + + # The maximum number of user tasks for concurrently running in async endpoints across all users. 
+ max.active.user.tasks=5 + # Enable self healing for all anomaly detectors, unless the particular anomaly detector is explicitly disabled - self.healing.enabled=true + self.healing.enabled=false # Enable self healing for broker failure detector #self.healing.broker.failure.enabled=true @@ -217,6 +232,61 @@ data: # Enable self healing for metric anomaly detector #self.healing.metric.anomaly.enabled=true + + # configurations for the webserver + # ================================ + + # HTTP listen port + webserver.http.port=9090 + + # HTTP listen address + webserver.http.address=0.0.0.0 + + # Whether CORS support is enabled for API or not + webserver.http.cors.enabled=false + + # Value for Access-Control-Allow-Origin + webserver.http.cors.origin=http://localhost:8080/ + + # Value for Access-Control-Request-Method + webserver.http.cors.allowmethods=OPTIONS,GET,POST + + # Headers that should be exposed to the Browser (Webapp) + # This is a special header that is used by the + # User Tasks subsystem and should be explicitly + # Enabled when CORS mode is used as part of the + # Admin Interface + webserver.http.cors.exposeheaders=User-Task-ID + + # REST API default prefix + # (don't forget the ending *) + webserver.api.urlprefix=/kafkacruisecontrol/* + + # Location where the Cruise Control frontend is deployed + webserver.ui.diskpath=./cruise-control-ui/dist/ + + # URL path prefix for UI + # (don't forget the ending *) + webserver.ui.urlprefix=/* + + # Time After which request is converted to Async + webserver.request.maxBlockTimeMs=10000 + + # Default Session Expiry Period + webserver.session.maxExpiryTimeMs=60000 + + # Session cookie path + webserver.session.path=/ + + # Server Access Logs + webserver.accesslog.enabled=true + + # Location of HTTP Request Logs + webserver.accesslog.path=access.log + + # HTTP Request Log retention days + webserver.accesslog.retention.days=14 + capacityJBOD.json: |- { "brokerCapacities":[ From 863e2ac8af335b83d5ac9178699bbdf7e9160fc6 Mon Sep 
17 00:00:00 2001 From: Staffan Olsson Date: Thu, 20 Dec 2018 21:19:11 +0100 Subject: [PATCH 08/16] We use min.insync.replicas=2 since https://github.com/Yolean/kubernetes-kafka/pull/107 --- cruise-control/11cruise-control-config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cruise-control/11cruise-control-config.yml b/cruise-control/11cruise-control-config.yml index da6748bb..e7d11116 100644 --- a/cruise-control/11cruise-control-config.yml +++ b/cruise-control/11cruise-control-config.yml @@ -352,7 +352,7 @@ data: clusterConfigs.json: |- { - "min.insync.replicas": 1, + "min.insync.replicas": 2, "an.example.cluster.config": false } From 13ed6ebe49973d98e1a0c1078506eb17737f79e7 Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Thu, 20 Dec 2018 21:40:54 +0100 Subject: [PATCH 09/16] Avoids curl download at runtime because it might prevent broker restart if the remote server is down Using image from https://github.com/StreamingMicroservicesPlatform/docker-kafka/pull/8 --- .../10broker-cruise-control-reporter-config.yml | 12 ------------ cruise-control/20kafka-broker-reporter-patch.yml | 14 +++++++------- 2 files changed, 7 insertions(+), 19 deletions(-) delete mode 100644 cruise-control/10broker-cruise-control-reporter-config.yml diff --git a/cruise-control/10broker-cruise-control-reporter-config.yml b/cruise-control/10broker-cruise-control-reporter-config.yml deleted file mode 100644 index 9b6f190b..00000000 --- a/cruise-control/10broker-cruise-control-reporter-config.yml +++ /dev/null @@ -1,12 +0,0 @@ -kind: ConfigMap -metadata: - name: broker-cruise-control-reporter-config - namespace: kafka -apiVersion: v1 -data: - cruise-control-reporter-init.sh: |- - #!/bin/bash - set -xe - VERSION=2.0.6 - curl -L -o /opt/kafka/libs/extensions/cruise-control-metrics-reporter.jar https://linkedin.jfrog.io/linkedin/cruise-control/com/linkedin/cruisecontrol/cruise-control-metrics-reporter/${VERSION}/cruise-control-metrics-reporter-${VERSION}.jar - echo -e 
"\n\nmetric.reporters = com.linkedin.kafka.cruisecontrol.metricsreporter.CruiseControlMetricsReporter" >> /etc/kafka/server.properties diff --git a/cruise-control/20kafka-broker-reporter-patch.yml b/cruise-control/20kafka-broker-reporter-patch.yml index 1f2ec935..a171f5bf 100644 --- a/cruise-control/20kafka-broker-reporter-patch.yml +++ b/cruise-control/20kafka-broker-reporter-patch.yml @@ -10,11 +10,14 @@ spec: spec: initContainers: - name: cruise-control-reporter - image: hortonworks/alpine-curl:3.1 - command: ['/bin/sh', '/tmp/cruise-control-reporter-configmap/cruise-control-reporter-init.sh'] + image: solsson/kafka-cruise-control-jar@sha256:20a26ae1fab5d63592143093e460d06c4d8f1bef666d592b6393a3b41e3743e8 + command: + - /bin/sh + - -cex + - | + cp -v /cruise-control-metrics-reporter.jar /opt/kafka/libs/extensions/cruise-control-metrics-reporter.jar + echo -e "\n\nmetric.reporters = com.linkedin.kafka.cruisecontrol.metricsreporter.CruiseControlMetricsReporter" | tee -a /etc/kafka/server.properties volumeMounts: - - name: cruiseconfigmap - mountPath: /tmp/cruise-control-reporter-configmap - name: config mountPath: /etc/kafka - name: extensions @@ -31,8 +34,5 @@ spec: - name: extensions mountPath: /opt/kafka/libs/extensions volumes: - - name: cruiseconfigmap - configMap: - name: broker-cruise-control-reporter-config - name: extensions emptyDir: {} From be6eaed72f6a2157cb8f6be4f97118916534c761 Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Thu, 20 Dec 2018 21:54:03 +0100 Subject: [PATCH 10/16] Prevents deletion of kafka at delete -f cruise-control/ --- cruise-control/20kafka-broker-reporter-patch.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/cruise-control/20kafka-broker-reporter-patch.yml b/cruise-control/20kafka-broker-reporter-patch.yml index a171f5bf..1ae5471a 100644 --- a/cruise-control/20kafka-broker-reporter-patch.yml +++ b/cruise-control/20kafka-broker-reporter-patch.yml @@ -1,7 +1,5 @@ # meant to be applied using # kubectl --namespace kafka 
patch statefulset kafka --patch "$(cat cruise-control/20kafka-broker-reporter-patch.yml )" -apiVersion: apps/v1 -kind: StatefulSet metadata: name: kafka namespace: kafka From da75c0aeb4a4e87d09e8989153782a090a80700d Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Fri, 21 Dec 2018 06:19:52 +0100 Subject: [PATCH 11/16] Now try self healing --- cruise-control/11cruise-control-config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cruise-control/11cruise-control-config.yml b/cruise-control/11cruise-control-config.yml index e7d11116..0b0a0eee 100644 --- a/cruise-control/11cruise-control-config.yml +++ b/cruise-control/11cruise-control-config.yml @@ -221,7 +221,7 @@ data: max.active.user.tasks=5 # Enable self healing for all anomaly detectors, unless the particular anomaly detector is explicitly disabled - self.healing.enabled=false + self.healing.enabled=true # Enable self healing for broker failure detector #self.healing.broker.failure.enabled=true From e5eec5b1a8534ea6bbfa9b6cc902ba76373fcb87 Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Fri, 21 Dec 2018 06:58:43 +0100 Subject: [PATCH 12/16] Always use exact images, for stability and security --- cruise-control/50cruise-control.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cruise-control/50cruise-control.yml b/cruise-control/50cruise-control.yml index 93535b42..dbf1eba9 100644 --- a/cruise-control/50cruise-control.yml +++ b/cruise-control/50cruise-control.yml @@ -17,7 +17,7 @@ spec: terminationGracePeriodSeconds: 30 initContainers: - name: init-config - image: alpine:3.8 + image: busybox@sha256:2a03a6059f21e150ae84b0973863609494aad70f0a80eaeb64bddd8d92465812 command: ['/bin/sh'] args: [ '-c', 'cp /etc/cruise-control-configmap/* /opt/cruise-control/config'] volumeMounts: @@ -27,7 +27,7 @@ spec: mountPath: /opt/cruise-control/config containers: - name: cruise-control - image: pdouble16/kafka-cruise-control-docker:2.0.17 + image: 
pdouble16/kafka-cruise-control-docker:2.0.17@sha256:2bf22a47a928689b94f0771cee4be557dff35948269d31a348bc1e3055ea2336 imagePullPolicy: IfNotPresent ports: - name: api From ab8d0e80b7c4c263f30fdb05734d6469389c154c Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Sat, 22 Dec 2018 21:34:45 +0100 Subject: [PATCH 13/16] Moves the extensions mechanism to kafka core --- cruise-control/20kafka-broker-reporter-patch.yml | 11 ----------- kafka/50kafka.yml | 8 ++++++++ 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/cruise-control/20kafka-broker-reporter-patch.yml b/cruise-control/20kafka-broker-reporter-patch.yml index 1ae5471a..bfb8943a 100644 --- a/cruise-control/20kafka-broker-reporter-patch.yml +++ b/cruise-control/20kafka-broker-reporter-patch.yml @@ -23,14 +23,3 @@ spec: $setElementOrder/initContainers: - name: init-config - name: cruise-control-reporter - containers: - - name: broker - env: - - name: CLASSPATH - value: /opt/kafka/libs/extensions/cruise-control-metrics-reporter.jar - volumeMounts: - - name: extensions - mountPath: /opt/kafka/libs/extensions - volumes: - - name: extensions - emptyDir: {} diff --git a/kafka/50kafka.yml b/kafka/50kafka.yml index 36cf64dc..93a17bef 100644 --- a/kafka/50kafka.yml +++ b/kafka/50kafka.yml @@ -41,10 +41,14 @@ spec: mountPath: /etc/kafka-configmap - name: config mountPath: /etc/kafka + - name: extensions + mountPath: /opt/kafka/libs/extensions containers: - name: broker image: solsson/kafka:2.1.0@sha256:ac3f06d87d45c7be727863f31e79fbfdcb9c610b51ba9cf03c75a95d602f15e1 env: + - name: CLASSPATH + value: /opt/kafka/libs/extensions/* - name: KAFKA_LOG4J_OPTS value: -Dlog4j.configuration=file:/etc/kafka/log4j.properties - name: JMX_PORT @@ -81,12 +85,16 @@ spec: mountPath: /etc/kafka - name: data mountPath: /var/lib/kafka/data + - name: extensions + mountPath: /opt/kafka/libs/extensions volumes: - name: configmap configMap: name: broker-config - name: config emptyDir: {} + - name: extensions + emptyDir: {} 
volumeClaimTemplates: - metadata: name: data From 432f1e8b55a6289e4dd6b4f704982a76d5d092e2 Mon Sep 17 00:00:00 2001 From: Staffan Olsson Date: Sat, 22 Dec 2018 21:35:54 +0100 Subject: [PATCH 14/16] Cruise Control on Java 11 and the metrics jar copied from the same image --- cruise-control/20kafka-broker-reporter-patch.yml | 4 ++-- cruise-control/50cruise-control.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cruise-control/20kafka-broker-reporter-patch.yml b/cruise-control/20kafka-broker-reporter-patch.yml index bfb8943a..e349a410 100644 --- a/cruise-control/20kafka-broker-reporter-patch.yml +++ b/cruise-control/20kafka-broker-reporter-patch.yml @@ -8,12 +8,12 @@ spec: spec: initContainers: - name: cruise-control-reporter - image: solsson/kafka-cruise-control-jar@sha256:20a26ae1fab5d63592143093e460d06c4d8f1bef666d592b6393a3b41e3743e8 + image: solsson/kafka-cruise-control@sha256:af4af35cd1c44b2256e96246a98350ff3e53f64a3061de1a5b0c2a2e9f8e2d8c command: - /bin/sh - -cex - | - cp -v /cruise-control-metrics-reporter.jar /opt/kafka/libs/extensions/cruise-control-metrics-reporter.jar + cp -v /opt/cruise-control/cruise-control/build/dependant-libs/cruise-control-metrics-reporter.jar /opt/kafka/libs/extensions/cruise-control-metrics-reporter.jar echo -e "\n\nmetric.reporters = com.linkedin.kafka.cruisecontrol.metricsreporter.CruiseControlMetricsReporter" | tee -a /etc/kafka/server.properties volumeMounts: - name: config diff --git a/cruise-control/50cruise-control.yml b/cruise-control/50cruise-control.yml index dbf1eba9..0713bee1 100644 --- a/cruise-control/50cruise-control.yml +++ b/cruise-control/50cruise-control.yml @@ -27,7 +27,7 @@ spec: mountPath: /opt/cruise-control/config containers: - name: cruise-control - image: pdouble16/kafka-cruise-control-docker:2.0.17@sha256:2bf22a47a928689b94f0771cee4be557dff35948269d31a348bc1e3055ea2336 + image: solsson/kafka-cruise-control@sha256:af4af35cd1c44b2256e96246a98350ff3e53f64a3061de1a5b0c2a2e9f8e2d8c 
imagePullPolicy: IfNotPresent ports: - name: api From 7cf81da6c2234e92c044d8b599467ad939bd445f Mon Sep 17 00:00:00 2001 From: Patrick Double Date: Wed, 26 Dec 2018 09:16:20 -0600 Subject: [PATCH 15/16] Add readme --- cruise-control/README.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 cruise-control/README.md diff --git a/cruise-control/README.md b/cruise-control/README.md new file mode 100644 index 00000000..26883068 --- /dev/null +++ b/cruise-control/README.md @@ -0,0 +1,5 @@ +## Cruise Control + +Cruise Control is used to automate the dynamic workload rebalance and self-healing of a Kafka cluster. This tool will allow you to add, replace or remove nodes and the cluster will be automatically adjusted. Partitions will be rebalanced based on resource usage. + +The default configuration has self healing enabled. From 1d5299665fbbdcd765a35a36dffee0d95665be02 Mon Sep 17 00:00:00 2001 From: Patrick Double Date: Sat, 5 Jan 2019 11:02:05 -0600 Subject: [PATCH 16/16] Improve README, minor config changes --- cruise-control/11cruise-control-config.yml | 3 +-- .../20kafka-broker-reporter-patch.yml | 2 +- cruise-control/README.md | 25 +++++++++++++++++-- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/cruise-control/11cruise-control-config.yml b/cruise-control/11cruise-control-config.yml index 0b0a0eee..b964d199 100644 --- a/cruise-control/11cruise-control-config.yml +++ b/cruise-control/11cruise-control-config.yml @@ -352,8 +352,7 @@ data: clusterConfigs.json: |- { - "min.insync.replicas": 2, - "an.example.cluster.config": false + "min.insync.replicas": 2 } log4j2.xml: |- diff --git a/cruise-control/20kafka-broker-reporter-patch.yml b/cruise-control/20kafka-broker-reporter-patch.yml index e349a410..9467062e 100644 --- a/cruise-control/20kafka-broker-reporter-patch.yml +++ b/cruise-control/20kafka-broker-reporter-patch.yml @@ -1,5 +1,5 @@ # meant to be applied using -# kubectl --namespace kafka patch statefulset kafka --patch "$(cat 
cruise-control/20kafka-broker-reporter-patch.yml )" +# kubectl --namespace kafka patch statefulset kafka --patch "$(cat cruise-control/20kafka-broker-reporter-patch.yml)" metadata: name: kafka namespace: kafka diff --git a/cruise-control/README.md b/cruise-control/README.md index 26883068..c27ba2f1 100644 --- a/cruise-control/README.md +++ b/cruise-control/README.md @@ -1,5 +1,26 @@ ## Cruise Control -Cruise Control is used to automate the dynamic workload rebalance and self-healing of a Kafka cluster. This tool will allow you to add, replace or remove nodes and the cluster will be automatically adjusted. Partitions will be rebalanced based on resource usage. +Cruise Control is used to automate the dynamic workload rebalance and self-healing of a Kafka cluster. This tool will allow you to add, replace or remove nodes and the cluster will be automatically adjusted. Partitions will be rebalanced based on resource usage of CPU, network, disk, etc. -The default configuration has self healing enabled. +*Disclaimer*: It is important to understand Cruise Control will modify the Kafka cluster without operator intervention. Bugs or misconfiguration may cause loss of data or denial of service. You bear the responsibility of configuring and testing properly and taking precautions based on the importance of your data. + +### Configuration + +There are several configuration files that need to be mounted in `/opt/cruise-control/config`. The files in `11cruise-control-config.yml` are the defaults from [the Cruise Control GitHub repo, migrate_to_kafka_2_0 branch](https://github.com/linkedin/cruise-control/tree/migrate_to_kafka_2_0/config). The significant modification from the GitHub repo is that self healing has been enabled using `self.healing.enabled=true`. + +Following are the files in `11cruise-control-config.yml`. Nearly all changes you would make are in `cruisecontrol.properties`. 
+ +- cruisecontrol.properties +- capacityJBOD.json +- capacity.json +- clusterConfigs.json +- log4j2.xml +- log4j.properties + +### Patching + +Cruise Control requires broker metrics to make informed decisions. Each broker runs a metric collector that pushes metrics into a topic, by default named `__CruiseControlMetrics`. Configuring the collector requires patching the broker StatefulSet. An example command to apply this patch is below. + +```shell +$ kubectl --namespace kafka patch statefulset kafka --patch "$(cat cruise-control/20kafka-broker-reporter-patch.yml)" +```