diff --git a/CHANGELOG.md b/CHANGELOG.md index f37b059d..1a3b3f57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Refactors eoapi-support into core eoapi chart [#262](https://github.com/developmentseed/eoapi-k8s/pull/262) + ## [0.7.13] - 2025-11-04 ### Added @@ -44,10 +48,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Adjusted Renovate Configuration to fit conventional commits [#295](https://github.com/developmentseed/eoapi-k8s/pull/295) - Notification triggers in database [#289](https://github.com/developmentseed/eoapi-k8s/pull/289) -### Changed - -- Excluded renovate.json from CHANGELOG.md edits [#301](https://github.com/developmentseed/eoapi-k8s/pull/301) - ## [0.7.8] - 2025-09-10 ### Added diff --git a/charts/eoapi-support/.gitignore b/charts/eoapi-support/.gitignore deleted file mode 100644 index 082a7414..00000000 --- a/charts/eoapi-support/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -Chart.lock -/charts diff --git a/charts/eoapi-support/.helmignore b/charts/eoapi-support/.helmignore deleted file mode 100644 index ada987c3..00000000 --- a/charts/eoapi-support/.helmignore +++ /dev/null @@ -1,30 +0,0 @@ -# Non default entries manually added by support developers - -# Ignore the .yaml that generates the .json, only the .json is relevant to -# bundle with the Helm chart when it is packaged or "helm dep up" is used to -# copy it over to another location where it is referenced. -values.schema.yaml - -# ----------------------------------------------------------------------------- - -# Patterns to ignore when building packages. -# This supports shell glob matching, relative path matching, and -# negation (prefixed with !). Only one pattern per line. -.DS_Store -# Common VCS dirs -.git/ -.gitignore -.bzr/ -.bzrignore -.hg/ -.hgignore -.svn/ -# Common backup files -*.swp -*.bak -*.tmp -*~ -# Various IDEs -.project -.idea/ -*.tmproj diff --git a/charts/eoapi-support/Chart.yaml b/charts/eoapi-support/Chart.yaml deleted file mode 100644 index bbaaedd1..00000000 --- a/charts/eoapi-support/Chart.yaml +++ /dev/null @@ -1,33 +0,0 @@ -apiVersion: v2 -name: eoapi-support - -appVersion: "0.1.7" -version: "0.1.7" - -dependencies: - - name: metrics-server - version: 7.4.12 - repository: https://charts.bitnami.com/bitnami - - # Prometheus for collection of metrics. 
- # https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus - # - - name: prometheus - # NOTE: configuration for this dependency is handled in `eoapi-support/values.yaml.prometheus` values - version: 27.45.0 - repository: https://prometheus-community.github.io/helm-charts - - # used to create custom metrics to autoscale on - # - - name: prometheus-adapter - # NOTE: configuration for this dependency is handled in `eoapi-support/values.yaml.prometheus-adapter` values - version: 5.2.0 - repository: https://prometheus-community.github.io/helm-charts - - # Grafana for dashboarding of metrics - # https://github.com/grafana/helm-charts/tree/main/charts/grafana - # - - name: grafana - # NOTE: configuration for this dependency is handled in `eoapi-support/values.yaml.grafana` values - version: 10.1.4 - repository: https://grafana.github.io/helm-charts diff --git a/charts/eoapi-support/README.md b/charts/eoapi-support/README.md deleted file mode 100644 index b218eb69..00000000 --- a/charts/eoapi-support/README.md +++ /dev/null @@ -1,5 +0,0 @@ -#### eoAPI Support - -observability, monitoring and some custom metrics for autoscaling - -(please see documentation about `helm install` and configuration at ../../docs/autoscaling.md) diff --git a/charts/eoapi-support/values.yaml b/charts/eoapi-support/values.yaml deleted file mode 100644 index febe6af3..00000000 --- a/charts/eoapi-support/values.yaml +++ /dev/null @@ -1,178 +0,0 @@ -# most of this was cribbed from https://github.com/2i2c-org/infrastructure/blob/master/helm-charts/support/ -# so giving props where props are due to Yuvi Panda :sparkles: -prometheus-adapter: - prometheus: - # NOTE: the `url` below makes assumptions about release name and namespace: - # 1) Release name is "eoapi-support" (follows RELEASE_NAME-prometheus-server pattern) - # 2) Deployed in "eoapi" namespace - # 3) If using different release name, update to: http://YOUR_RELEASE_NAME-prometheus-server.YOUR_NAMESPACE.svc.cluster.local - url: http://eoapi-support-prometheus-server.eoapi.svc.cluster.local - port: 80 - path: "" - rules: - default: false - # NOTE: the `name.as` values below make some assumptions about your release name - # namely that you have run `helm install eoapi eoapi/eoapi --create-namespace=eoapi` - custom: - - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' - seriesFilters: [] - resources: - template: <<.Resource>> - name: - matches: "" - as: "nginx_ingress_controller_requests_rate_vector_eoapi" - metricsQuery: round(sum(rate(<<.Series>>{service="vector",path=~"/vector.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) - - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' - seriesFilters: [] - resources: - template: <<.Resource>> - name: - matches: "" - as: "nginx_ingress_controller_requests_rate_raster_eoapi" - metricsQuery: round(sum(rate(<<.Series>>{service="raster",path=~"/raster.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) - - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' - seriesFilters: [] - resources: - template: <<.Resource>> - name: - matches: "" - as: "nginx_ingress_controller_requests_rate_stac_eoapi" - metricsQuery: round(sum(rate(<<.Series>>{service="stac",path=~"/stac.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) - -prometheus: - # alertmanager is an optional prometheus chart dependency that we opt-out from - # as we favor Grafana for this functionality. 
Grafana provides alerts and does - # so with a better UI that we expose publicly behind auth anyhow. - # - alertmanager: - enabled: false - - # prometheus-pushgateway is an optional prometheus chart dependency that we - # opt-out from. pushgateway provides a way to complement prometheus server's - # behavior of scraping metrics from services by allowing services to push - # metrics to prometheus. - # - prometheus-pushgateway: - enabled: false - - # kube-state-metrics is deployed by default but listing here just so we know it is - kube-state-metrics: - enabled: true - - # prometheus-node-exporter is an optional prometheus chart dependency that we - # rely on to collect metrics about the nodes - # - # values ref: https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-node-exporter/values.yaml - # - prometheus-node-exporter: - # resources for the node-exporter was set after inspecting cpu and memory - # use via prometheus and grafana. - # - # node-exporter is typically found using between 0-3m CPU and 2-22Mi memory, - # but we've seen it fail to report cpu/memory use metrics from time to time - # when requesting and limiting to 5m, so we've increased requests/limit it - # to 10m. - # - # PromQL queries for CPU and memory use: - # - CPU: sum(rate(container_cpu_usage_seconds_total{container="node-exporter", namespace="support"}[5m])) by (pod) - # - Memory: sum(container_memory_usage_bytes{container="node-exporter", namespace="support"}) by (pod) - # - resources: - limits: - cpu: 10m - memory: 30Mi - requests: - cpu: 10m - memory: 30Mi - server: - service: - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: "nlb" - service.beta.kubernetes.io/aws-load-balancer-internal: "false" - type: LoadBalancer - -grafana: - persistence: - enabled: false - deploymentStrategy: - type: Recreate - service: - type: LoadBalancer - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: "nlb" - service.beta.kubernetes.io/aws-load-balancer-internal: "false" - rbac: - namespaced: true - pspEnabled: false - # initChownData refers to an init container enabled by default that isn't - # needed as we don't reconfigure the linux user the grafana server will run - # as. - initChownData: - enabled: false - - # resources for grafana was set after inspecting cpu and memory use via - # prometheus and grafana. - # - # Grafana's memory use seems to increase over time but seems reasonable to - # stay below 200Mi for years to come. Grafana's CPU use seems miniscule with - # peaks at up to 9m CPU from one user is browsing its dashboards. 
- # - # PromQL queries for CPU and memory use: - # - CPU: sum(rate(container_cpu_usage_seconds_total{container="grafana", namespace="support"}[5m])) by (pod) - # - Memory: sum(container_memory_usage_bytes{container="grafana", namespace="support"}) by (pod) - # - resources: - limits: - cpu: 100m - memory: 200Mi - requests: - cpu: 10m - memory: 200Mi - - datasources: - datasources.yaml: - apiVersion: 1 - datasources: - # Automatically add the prometheus server in the same namespace as the grafana as a datasource - - name: prometheus - orgId: 1 - type: prometheus - # NOTE: the `url` below makes assumptions about release name and namespace: - # 1) Release name is "eoapi-support" (follows RELEASE_NAME-prometheus-server pattern) - # 2) Deployed in "eoapi" namespace - # 3) If using different release name, update to: http://YOUR_RELEASE_NAME-prometheus-server.YOUR_NAMESPACE.svc.cluster.local - url: http://eoapi-support-prometheus-server.eoapi.svc.cluster.local - access: proxy - jsonData: - timeInterval: "5s" - isDefault: true - editable: true - version: 1 # This number should be increased when changes are made to update the datasource - - dashboardProviders: - dashboardproviders.yaml: - apiVersion: 1 - providers: - - name: 'default' - orgId: 1 - folder: '' - type: file - disableDeletion: false - editable: true - options: - path: /var/lib/grafana/dashboards/default - - dashboardsConfigMaps: - # NOTE: This must match the ConfigMap name created in templates/dashboard.config.yaml - # The template creates: {{ .Release.Name }}-dashboards - # If release name is "eoapi-support", this should be "eoapi-support-dashboards" - # Update this value to match your actual release name + "-dashboards" - default: "eoapi-support-dashboards" - -metrics-server: - image: - registry: docker.io - repository: bitnamilegacy/metrics-server - tag: "0.8.0-debian-12-r4" - apiService: - create: true diff --git a/charts/eoapi/Chart.yaml b/charts/eoapi/Chart.yaml index 8f682c90..becf6717 100644 --- a/charts/eoapi/Chart.yaml +++ b/charts/eoapi/Chart.yaml @@ -61,3 +61,19 @@ dependencies: version: v1.20.0 repository: https://knative.github.io/operator condition: knative.enabled + - name: metrics-server + version: 7.4.12 + repository: https://charts.bitnami.com/bitnami + condition: monitoring.metricsServer.enabled + - name: prometheus + version: 27.45.0 + repository: https://prometheus-community.github.io/helm-charts + condition: monitoring.prometheus.enabled + - name: prometheus-adapter + version: 5.2.0 + repository: https://prometheus-community.github.io/helm-charts + condition: monitoring.prometheusAdapter.enabled + - name: grafana + version: 10.1.4 + repository: https://grafana.github.io/helm-charts + condition: observability.grafana.enabled diff --git a/charts/eoapi/README.md b/charts/eoapi/README.md index 8b569096..6a0da6f0 100644 --- a/charts/eoapi/README.md +++ b/charts/eoapi/README.md @@ -14,6 +14,8 @@ A Helm chart for deploying Earth Observation APIs with integrated STAC, raster, - Flexible database configuration - Real-time PostgreSQL notifications for STAC item changes - Unified ingress system +- Autoscaling +- Integrated observability (Prometheus & Grafana) ## TL;DR diff --git a/charts/eoapi-support/dashboards/eoAPI-Dashboard.json b/charts/eoapi/dashboards/eoAPI-Dashboard.json similarity index 100% rename from charts/eoapi-support/dashboards/eoAPI-Dashboard.json rename to charts/eoapi/dashboards/eoAPI-Dashboard.json diff --git a/charts/eoapi/templates/_monitoring.yaml b/charts/eoapi/templates/_monitoring.yaml new file mode 
100644 index 00000000..8acfb713 --- /dev/null +++ b/charts/eoapi/templates/_monitoring.yaml @@ -0,0 +1,40 @@ +{{/* +Common monitoring configurations to avoid duplication across values files +*/}} + +{{/* +Standard monitoring configuration with environment-specific settings +Usage: {{ include "eoapi.monitoring.config" (dict "context" . "environment" "production" "persistence" true) }} +Environments: basic, production, testing +*/}} +{{- define "eoapi.monitoring.config" -}} +{{- $ctx := .context -}} +{{- $env := .environment | default "basic" -}} +{{- $persistence := .persistence | default false -}} +metricsServer: + enabled: true + apiService: + create: true +prometheus: + enabled: true + alertmanager: + enabled: {{ if eq $env "production" }}true{{ else }}false{{ end }} + prometheus-pushgateway: + enabled: false + kube-state-metrics: + enabled: true + prometheus-node-exporter: + enabled: true + resources: {{- include "eoapi.resources.small" $ctx | nindent 6 }} + server: + service: + type: ClusterIP + {{- if $persistence }} + persistentVolume: + enabled: true + size: 10Gi + {{- else }} + persistentVolume: + enabled: false + {{- end }} +{{- end -}} diff --git a/charts/eoapi/templates/_resources.yaml b/charts/eoapi/templates/_resources.yaml new file mode 100644 index 00000000..2c4ab5b7 --- /dev/null +++ b/charts/eoapi/templates/_resources.yaml @@ -0,0 +1,51 @@ +{{/* +Common resource definitions to avoid duplication across values files +*/}} + +{{/* +Small resource allocation for lightweight components +*/}} +{{- define "eoapi.resources.small" -}} +limits: + cpu: 10m + memory: 30Mi +requests: + cpu: 10m + memory: 30Mi +{{- end -}} + +{{/* +Medium resource allocation for standard services +*/}} +{{- define "eoapi.resources.medium" -}} +limits: + cpu: 100m + memory: 128Mi +requests: + cpu: 50m + memory: 64Mi +{{- end -}} + +{{/* +Large resource allocation for heavy workloads +*/}} +{{- define "eoapi.resources.large" -}} +limits: + cpu: 500m + memory: 512Mi +requests: + cpu: 250m + memory: 256Mi +{{- end -}} + +{{/* +Grafana specific resources based on observed usage patterns +*/}} +{{- define "eoapi.resources.grafana" -}} +limits: + cpu: 100m + memory: 200Mi +requests: + cpu: 50m + memory: 100Mi +{{- end -}} diff --git a/charts/eoapi-support/templates/dashboard.config.yaml b/charts/eoapi/templates/observability.yaml similarity index 77% rename from charts/eoapi-support/templates/dashboard.config.yaml rename to charts/eoapi/templates/observability.yaml index 6c0f2382..fdf132a2 100644 --- a/charts/eoapi-support/templates/dashboard.config.yaml +++ b/charts/eoapi/templates/observability.yaml @@ -1,3 +1,4 @@ +{{- if .Values.observability.grafana.enabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -7,3 +8,4 @@ metadata: data: kubernetes.json: |- {{ .Files.Get "dashboards/eoAPI-Dashboard.json" | indent 4 }} +{{- end }} diff --git a/charts/eoapi/values.yaml b/charts/eoapi/values.yaml index 2fc60ae0..df73aea0 100644 --- a/charts/eoapi/values.yaml +++ b/charts/eoapi/values.yaml @@ -216,7 +216,7 @@ raster: enabled: true # Control ingress specifically for raster service path: "/raster" # Configurable path prefix for the raster service autoscaling: - # NOTE: to have autoscaling working you'll need to install the `eoapi-support` chart + # NOTE: to have autoscaling working you'll need to enable monitoring # see ../../../docs/autoscaling.md for more information enabled: false minReplicas: 1 @@ -288,7 +288,7 @@ multidim: enabled: true # Control ingress specifically for multidim service path: "/multidim" # 
Configurable path prefix for the multidim service autoscaling: - # NOTE: to have autoscaling working you'll need to install the `eoapi-support` chart + # NOTE: to have autoscaling working you'll need to enable monitoring # see ../../../docs/autoscaling.md for more information enabled: false minReplicas: 1 @@ -360,7 +360,7 @@ stac: enabled: true # Control ingress specifically for stac service path: "/stac" # Configurable path prefix for the stac service autoscaling: - # NOTE: to have autoscaling working you'll need to install the `eoapi-support` chart + # NOTE: to have autoscaling working you'll need to enable monitoring # see ../../../docs/autoscaling.md for more information enabled: false minReplicas: 1 @@ -420,7 +420,7 @@ vector: enabled: true # Control ingress specifically for vector service path: "/vector" # Configurable path prefix for the vector service autoscaling: - # NOTE: to have autoscaling working you'll need to install the `eoapi-support` chart + # NOTE: to have autoscaling working you'll need to enable monitoring # see ../../../docs/autoscaling.md for more information enabled: false minReplicas: 1 @@ -560,8 +560,6 @@ knative: memory: 128Mi # Knative operator sub-chart configuration -# These values are passed directly to the knative-operator sub-chart -# The operator will be installed and can then deploy Knative Serving/Eventing via CRs knative-operator: tag: "v1.17.8" resources: @@ -573,8 +571,127 @@ knative-operator: memory: 256Mi ###################### -# VERSION MANAGEMENT +# MONITORING ###################### +# Core monitoring components for metrics collection and autoscaling +monitoring: + # Metrics server - essential for HPA functionality + metricsServer: + enabled: false + apiService: + create: true + + # Prometheus - core metrics collection for autoscaling + prometheus: + enabled: false + alertmanager: + enabled: false + prometheus-pushgateway: + enabled: false + kube-state-metrics: + enabled: true + prometheus-node-exporter: + enabled: true + resources: + limits: + cpu: 10m + memory: 30Mi + requests: + cpu: 10m + memory: 30Mi + server: + service: + type: ClusterIP # Internal service, no external exposure by default + + # Prometheus adapter - enables custom HPA metrics + prometheusAdapter: + enabled: false + prometheus: + # URL to Prometheus server - will be auto-configured for same-release Prometheus + # If using external Prometheus, set this to your Prometheus URL + # Example: http://my-prometheus-server.monitoring.svc.cluster.local + url: http://eoapi-prometheus-server.eoapi.svc.cluster.local + port: 80 + path: "" + rules: + default: false + # Custom metrics for eoapi service autoscaling + # Each service gets its own request rate metric for HPA scaling + custom: + # Vector service request rate metric + - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' + seriesFilters: [] + resources: + template: <<.Resource>> + name: + matches: "" + as: "nginx_ingress_controller_requests_rate_vector_eoapi" + metricsQuery: round(sum(rate(<<.Series>>{service="vector",path=~"/vector.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) + + # Raster service request rate metric + - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' + seriesFilters: [] + resources: + template: <<.Resource>> + name: + matches: "" + as: "nginx_ingress_controller_requests_rate_raster_eoapi" + metricsQuery: round(sum(rate(<<.Series>>{service="raster",path=~"/raster.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) + + # STAC service request 
rate metric + - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' + seriesFilters: [] + resources: + template: <<.Resource>> + name: + matches: "" + as: "nginx_ingress_controller_requests_rate_stac_eoapi" + metricsQuery: round(sum(rate(<<.Series>>{service="stac",path=~"/stac.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) + +###################### +# OBSERVABILITY +###################### +# Grafana dashboards and visualization (requires monitoring.prometheus.enabled=true) +observability: + grafana: + enabled: false + persistence: + enabled: false + service: + type: LoadBalancer + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: "nlb" + service.beta.kubernetes.io/aws-load-balancer-internal: "false" + # Resources consistent with eoapi.resources.grafana template in _resources.yaml + resources: + limits: + cpu: 100m + memory: 200Mi + requests: + cpu: 50m + memory: 100Mi + datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + url: "http://{{ .Release.Name }}-prometheus-server" + access: proxy + isDefault: true + dashboardsConfigMaps: + default: "{{ .Release.Name }}-dashboards" + +# Metrics Server sub-chart configuration +# These values are passed directly to the metrics-server sub-chart +metrics-server: + image: + registry: docker.io + repository: bitnamilegacy/metrics-server + tag: "0.8.0-debian-12-r4" + apiService: + create: true + # Version being upgraded from, used for migration purposes # Dont set the value in the values.yaml file # prefer to set it in the command line diff --git a/charts/eoapi/values/monitoring.yaml b/charts/eoapi/values/monitoring.yaml new file mode 100644 index 00000000..519d34bd --- /dev/null +++ b/charts/eoapi/values/monitoring.yaml @@ -0,0 +1,59 @@ +###################### +# MONITORING BASE CONFIG +###################### +# Base monitoring configuration - import in values files with: +# monitoring: !include values/monitoring.yaml + +monitoring: + enabled: true + + # Metrics server for HPA + metricsServer: + enabled: true + apiService: + create: true + resources: &small_resources + limits: + cpu: 10m + memory: 30Mi + requests: + cpu: 10m + memory: 30Mi + + # Prometheus stack + prometheus: + enabled: true + alertmanager: + enabled: false + prometheus-pushgateway: + enabled: false + + kube-state-metrics: + enabled: true + resources: *small_resources + + prometheus-node-exporter: + enabled: true + resources: *small_resources + + server: + service: + type: ClusterIP + persistentVolume: + enabled: false + size: 8Gi + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 200m + memory: 256Mi + +# Autoscaling defaults +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 10 + targetCPUUtilizationPercentage: 80 + targetMemoryUtilizationPercentage: 80 diff --git a/docs/autoscaling.md b/docs/autoscaling.md index f5f76d8a..5bbc4643 100644 --- a/docs/autoscaling.md +++ b/docs/autoscaling.md @@ -1,39 +1,50 @@ --- -title: "Autoscaling & Monitoring" -description: "HPA setup with custom metrics, Grafana dashboards, Prometheus configuration, and load testing" +title: "Autoscaling" +description: "Horizontal Pod Autoscaler (HPA) configuration for eoAPI services." 
external_links: - name: "eoapi-k8s Repository" url: "https://github.com/developmentseed/eoapi-k8s" - name: "Kubernetes HPA Documentation" url: "https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/" - - name: "Prometheus Documentation" - url: "https://prometheus.io/docs/" - - name: "Grafana Documentation" - url: "https://grafana.com/docs/" --- -# Autoscaling / Monitoring / Observability +# Autoscaling -Autoscaling is both art and science. To test out your application's autoscaling requirements you often need to consider -your data volume, data usage patterns, bottlenecks (such as the database) among many, many other things. Load testing, -metrics, monitoring and observability will help you explore what those needs are. +Horizontal Pod Autoscaler (HPA) configuration for eoAPI services. Autoscaling requires monitoring components to be enabled in the main chart. +## Prerequisites -> ⓘ The `eoapi-support` chart in this repository is required to be installed to -enable any of the eoAPI service autoscaling. It cannot be listed as a dependecy of `eoapi` chart -b/c of the limitations in `prometheus-adapter` and `grafana` for constructing the Prometheus internal -service domains dynamically. +Enable monitoring in your main eoapi installation: -If you are comfortable with k8s you probably only need to `helm install` the support chart and be on your way. Other folks -might want to read through the verbose walkthrough material below to familiarize yourself with how things work. +```yaml +monitoring: + prometheus: + enabled: true + prometheusAdapter: + enabled: true # Required for request-rate scaling + metricsServer: + enabled: true # Required for CPU scaling +``` ---- +## Configuration -## Helm Install `eoapi-support` +### Basic Autoscaling The following instructions assume you've gone through the [AWS](./aws-eks.md) or [GCP](./gcp-gke.md) cluster set up and installed the `eoapi` chart. +```yaml +stac: + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 20 + type: "requestRate" # Options: "cpu", "requestRate", "both" + targets: + requestRate: 50000m # 50 requests/second +``` + +### Scaling Policies 1. Go to the [releases section](https://github.com/developmentseed/eoapi-k8s/releases) of this repository and find the latest `eoapi-support-` version to install, or use the following command to get the latest version: @@ -43,361 +54,395 @@ and installed the `eoapi` chart. export SUPPORT_VERSION=$(helm search repo eoapi/eoapi-support --versions | head -2 | tail -1 | awk '{print $2}') ``` +```yaml +stac: + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 20 + type: "both" + behaviour: + scaleDown: + stabilizationWindowSeconds: 300 # 5min cooldown + policies: + - type: Percent + value: 50 # Max 50% pods removed per period + periodSeconds: 300 + scaleUp: + stabilizationWindowSeconds: 60 # 1min cooldown + policies: + - type: Percent + value: 100 # Max 100% pods added per period + periodSeconds: 60 + targets: + cpu: 70 + requestRate: 50000m +``` + +## Metrics Types + +### CPU-based Scaling +```yaml +type: "cpu" +targets: + cpu: 70 +``` + +### Request Rate Scaling +```yaml +type: "requestRate" +targets: + requestRate: 50000m # 50 requests/second +``` + + +### Combined Scaling +```yaml +type: "both" +targets: + cpu: 70 + requestRate: 100000m # 100 requests/second +``` + +## Custom Metrics Configuration + +When using request rate scaling, the prometheus-adapter needs to be configured to expose custom metrics. 
This is handled automatically when you enable monitoring in the main chart: + +```yaml +# In your main eoapi values file +ingress: + host: your-domain.com + +monitoring: + prometheusAdapter: + enabled: true + resources: + limits: + cpu: 250m + memory: 256Mi + requests: + cpu: 100m + memory: 128Mi +``` + +## Service-Specific Examples + +### STAC (High throughput) +```yaml +stac: + autoscaling: + enabled: true + minReplicas: 3 + maxReplicas: 20 + type: "requestRate" + targets: + requestRate: 40000m +``` + +### Raster (Resource intensive) +```yaml +raster: + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 8 + type: "cpu" + behaviour: + scaleDown: + stabilizationWindowSeconds: 300 + targets: + cpu: 75 +``` + +### Vector (Balanced) +```yaml +vector: + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 12 + type: "both" + targets: + cpu: 70 + requestRate: 75000m +``` + +## Resource Requirements + +### Autoscaling Components +- **metrics-server**: ~100m CPU, ~300Mi memory per node +- **prometheus-adapter**: ~250m CPU, ~256Mi memory +- **prometheus-server**: ~500m CPU, ~512Mi memory (varies with retention) + +## Verification + +### Check HPA Status + +```bash +# Check HPA status for all services +kubectl get hpa -n eoapi + +# Get detailed HPA information +kubectl describe hpa eoapi-stac -n eoapi +``` + +### Verify Custom Metrics API + +```bash +# Check if custom metrics API is available +kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" | jq . + +# Check specific request rate metrics +kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1/namespaces/eoapi/ingresses/*/requests_per_second" | jq . +``` + +### Check Prometheus Adapter + +```bash +# Check prometheus-adapter logs +kubectl logs -l app.kubernetes.io/name=prometheus-adapter -n eoapi +``` -2. Decide on a release name and `namespace` for your support chart. The next steps assume we've -chosen a release name of `eoapi-support` and a similar namespace of `eoapi-support` +## Load Testing +For load testing your autoscaling setup: -3. Then do a normal `helm install` but you'll want to parameterize and pass overrides for the prometheus URL to include -the release name and namespace chosen above. This allows other third-party dependencies used in the chart -(`prometheus-adpater` and `grafana`) know where to find the prometheus service internally. This is unfortunately a -manual step that cannot be automated +```yaml +ingress: + host: your-test-domain.com +``` +3. Check ingress configuration: ```bash - helm upgrade --install -n eoapi-support \ - --create-namespace eoapi-support eoapi/eoapi-support --version $SUPPORT_VERSION \ - --set prometheus-adapter.prometheus.url='http://eoapi-support-prometheus-server.eoapi-support.svc.cluster.local' \ - --set grafana.datasources.datasources\\.yaml.datasources[0].url='http://eoapi-support-prometheus-server.eoapi-support.svc.cluster.local' - ``` - - -4. 
verify that everything is set up correctly and no deployments are not failing: - - ```sh - watch -n 1 "kubectl -n eoapi-support get deploy,pod,svc" - NAME READY STATUS RESTARTS AGE - pod/eoapi-support-grafana-7fdc9688dd-wkw7p 1/1 Running 0 79s - pod/eoapi-support-kube-state-metrics-54d75784db-ghgbd 1/1 Running 0 79s - pod/eoapi-support-prometheus-adapter-668b6bd89c-kb25q 1/1 Running 0 79s - pod/eoapi-support-prometheus-node-exporter-6f96z 1/1 Running 0 79s - pod/eoapi-support-prometheus-node-exporter-fr96x 1/1 Running 0 79s - pod/eoapi-support-prometheus-node-exporter-pdvvp 1/1 Running 0 79s - pod/eoapi-support-prometheus-server-76dcfc684b-wmk5c 2/2 Running 0 79s - - NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE - service/eoapi-support-grafana LoadBalancer 10.123.248.75 104.154.59.180 80:30821/TCP 79s - service/eoapi-support-kube-state-metrics ClusterIP 10.123.241.247 8080/TCP 79s - service/eoapi-support-prometheus-adapter ClusterIP 10.123.249.21 443/TCP 79s - service/eoapi-support-prometheus-node-exporter ClusterIP 10.123.249.90 9100/TCP 79s - service/eoapi-support-prometheus-server ClusterIP 10.123.247.255 80/TCP 79s + kubectl get ingress -n eoapi ``` +## Troubleshooting -5. If anything in steps 1 through 3 seems confusing then here is a quick bash script to clear it up: - - ```shell - export RELEASE_NAME=eoapi - export RELEASE_NS=eoapi - export SUPPORT_RELEASE_NAME=eoapi-support - export SUPPORT_RELEASE_NS=eoapi-support - - # Get latest chart versions - export SUPPORT_VERSION=$(helm search repo eoapi/eoapi-support --versions | head -2 | tail -1 | awk '{print $2}') - export EOAPI_VERSION=$(helm search repo eoapi/eoapi --versions | head -2 | tail -1 | awk '{print $2}') - - PROMETHEUS_URL="http://${SUPPORT_RELEASE_NAME}-prometheus-server.${SUPPORT_RELEASE_NS}.svc.cluster.local" - - helm upgrade --install \ - -n $SUPPORT_RELEASE_NS --create-namespace $SUPPORT_RELEASE_NAME \ - eoapi/eoapi-support --version $SUPPORT_VERSION \ - --set prometheus-adapter.prometheus.url=$PROMETHEUS_URL \ - --set grafana.datasources.datasources\\.yaml.datasources[0].url=$PROMETHEUS_URL \ - -f /tmp/values-overrides.yaml - - helm upgrade --install \ - -n $RELEASE_NS --create-namespace $RELEASE_NAME \ - eoapi/eoapi --version $EOAPI_VERSION \ - -f /tmp/support-values-overrides.yaml - ``` - +### HPA Shows "Unknown" Metrics ---- +If HPA shows "unknown" for custom metrics: -### Review [Default Configuration and Options](./configuration.md) - -[This document](./configuration.md) will explain the differences in the `autoscaling` block for each service: - - ```yaml - autoscaling: - enabled: false - minReplicas: 1 - maxReplicas: 10 - # `type`: "cpu" || "requestRate" || "both" - type: "requestRate" - behaviour: {} - scaleDown: - stabilizationWindowSeconds: 60 - scaleUp: - stabilizationWindowSeconds: 0 - targets: - # matches `type` value above unless `type: "both"` is selected - cpu: 85 - requestRate: 15000 - ``` - ---- - -### How Autoscaling Works - -If you grok the default `eoapi-support` values in `values.yaml` you'll see we use custom metrics and prometheus queries -based on the nginx ingress controller's request rate under the `prometheus-adpater.prometheus:` key: - - ```yaml - prometheus-adapter: - prometheus: - # NOTE: the `url` below make some assumptions about the namespace where you released eoapi and prometheus - # 1) that you didn't change the default name of the `prometheus-server` or the port and installed in eoapi namespace - # 2) namely that you ran `helm install eoapi --create-namespace=eoapi` with the `eoapi` 
namespace - url: http://eoapi-support-prometheus-server.eoapi.svc.cluster.local - port: 80 - path: "" - rules: - default: false - # NOTE: the `name.as` values below make some assumptions about your release name - # namely that you have run `helm install eoapi eoapi/eoapi --create-namespace=eoapi` - custom: - - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' - seriesFilters: [] - resources: - template: <<.Resource>> - name: - matches: "" - as: "nginx_ingress_controller_requests_rate_vector_eoapi" - metricsQuery: round(sum(rate(<<.Series>>{service="vector",path=~"/vector.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) - - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' - seriesFilters: [] - resources: - template: <<.Resource>> - name: - matches: "" - as: "nginx_ingress_controller_requests_rate_raster_eoapi" - metricsQuery: round(sum(rate(<<.Series>>{service="raster",path=~"/raster.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) - - seriesQuery: '{__name__=~"^nginx_ingress_controller_requests$",namespace!=""}' - seriesFilters: [] - resources: - template: <<.Resource>> - name: - matches: "" - as: "nginx_ingress_controller_requests_rate_stac_eoapi" - metricsQuery: round(sum(rate(<<.Series>>{service="stac",path=~"/stac.*",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>), 0.001) - ``` - -Prometheus adapter is a bridge for metrics between Prometheus (which scrapes nginx) and the k8s metrics server so it can autoscale deployments using these custom metrics. -If you've chosen `both` or `requestRate` as a autoscaling `type:` for those values then these custom metrics are used to template an `hpa.yaml` for each service - -### Log into Grafana - -When you `helm install` the support chart you by default get a Grafana dashboard set up with different default metrics charts -to help you load test and explore your service autoscaling. Grafana creates a new username `admin` and password for you -that you'll have to retrieve to login. - -> ⓘ Note that the `service/eoapi-support-grafana` has an EXTERNAL-IP that we can use to view it. -This is just a quick way to work with it. You'll want to set it up with an ingress in the future - - -1. To log into Grafana you'll need to export the default username/password it came installed with. Note that secret names are prefixed -with the `release` name we installed the chart with below `-grafana`: - - ```sh - kubectl get secret eoapi-support-grafana --template='{{index .data "admin-user"}}' -n eoapi | base64 -d - # - kubectl get secret eoapi-support-grafana --template='{{index .data "admin-password"}}' -n eoapi | base64 -d - # - ``` - -2. To find the URL for the load balancer for where to log in with Grafana you can query the services: - - ```sh - kubectl get svc -n eoapi-support +1. Verify prometheus-adapter is running: + ```bash + kubectl get pods -l app.kubernetes.io/name=prometheus-adapter -n eoapi ``` -3. Login and you should be default be able to see the eoapi-k8s grafana dashboard. The Prometheus datasource will already be configured for you: - - ![Grafana Datasource Configuration](./images/datasource.png) - - You can then view the main eoAPI dashboard: - - ![](./images/gfdashboard.png) - - To add additional custom dashboards, you can use the dashboard import functionality: - - ![Adding Custom Grafana Dashboards](./images/add-grafana-dashboard.png) - -### Install or Upgrade Autoscaling Changes to `eoapi` Chart - -1. 
If you haven't already decide which services (`vector` || `raster` || `stac`) you want to enable `autoscaling` on change your values yaml for these and redeploy - - ```yaml - stac: - enabled: true - autoscaling: - enabled: true - type: "requestRate" - targets: - requestRate: 50000m - settings: - resources: - limits: - cpu: "1280m" - memory: "1536Mi" - requests: - cpu: "512m" - memory: "1024Mi" - vector: - enabled: true - autoscaling: - enabled: true - type: "requestRate" - targets: - requestRate: 50000m - settings: - resources: - limits: - cpu: "768m" - memory: "1536Mi" - requests: - cpu: "256m" - memory: "1024Mi" +2. Check prometheus-adapter logs: + ```bash + kubectl logs -l app.kubernetes.io/name=prometheus-adapter -n eoapi ``` -2. Review what the heck the unit `m` means for your [autoscaling values in the k8s docs](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#quantities) - - -3. Then `helm install` the eoapi chart with these changes - - ```sh - helm upgrade --install -n eoapi... +3. Verify metrics are available in Prometheus: + ```bash + # Port forward to access Prometheus + kubectl port-forward service/eoapi-prometheus-server 9090:80 -n eoapi + # Then check metrics at http://localhost:9090 ``` ---- - -### Add Load Balancer Host as a Host to Your Ingress +### Default Configuration -Unfortunately, nginx will not expose metrics for ingresses without hosts or hosts with wildcards. You'll either need to deploy -`eoapi-k8s` chart again with `ingress.tls.enabled` or need to find the `EXTERNAL-IP` for your `ingress-nginx-controller` and use that -to set up a simple host +Default autoscaling configuration: -1. Find the IP that your `ingress-nginx-controller` service load balancer: +```yaml +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 5 + # Type can be "cpu", "requestRate", or "both" + type: "cpu" + # Custom scaling behavior (optional) + behaviour: {} + # Scaling targets + targets: + # CPU target percentage (when type is "cpu" or "both") + cpu: 80 + # Request rate target in millirequests per second (when type is "requestRate" or "both") + requestRate: 30000m +``` - ```sh - kubectl -n ingress-nginx get svc/ingress-nginx-controller -o=jsonpath='{.status.loadBalancer.ingress[0].hostname}' - http://abc5929f88f8c45c38f6cbab2faad43c-776419634.us-west-2.elb.amazonaws.com/ - ``` +### No Scaling Activity -2. Then live edit your shared ingress for eoapi services to add the host: +If pods aren't scaling: - ```sh - kubectl edit ingress nginx-service-ingress-shared-eoapi -n eoapi +1. Check HPA events: + ```bash + kubectl describe hpa eoapi-stac -n eoapi ``` - ```yaml - # BEFORE - spec: - ingressClassName: nginx - rules: - - http: - paths: - ... +2. Verify metrics are being collected: + ```bash + kubectl top pods -n eoapi ``` - ```yaml - # AFTER - spec: - ingressClassName: nginx - rules: - - host: abc5929f88f8c45c38f6cbab2faad43c-776419634.us-west-2.elb.amazonaws.com - http: - paths: - ... +3. Check resource requests are set: + ```bash + kubectl describe pod eoapi-stac-xxx -n eoapi | grep -A 10 "Requests" ``` -And then finally roll out the deployment. 
- ```sh - kubectl rollout restart deploy/ingress-nginx-controller -n ingress-nginx - - ``` +### Install or Upgrade Autoscaling Changes to `eoapi` Chart ---- +When enabling autoscaling, ensure monitoring is also enabled: + +```yaml +# Enable monitoring first +monitoring: + prometheus: + enabled: true + prometheusAdapter: + enabled: true + +# Then enable autoscaling +stac: + autoscaling: + enabled: true + type: "requestRate" + targets: + requestRate: 50000m + +# Configure resources for proper scaling metrics +stac: + settings: + resources: + limits: + cpu: 1000m + memory: 512Mi + requests: + cpu: 100m + memory: 128Mi +``` + +### Custom Metrics Not Working + +If request rate metrics aren't working: + +1. Verify nginx ingress controller has metrics enabled +2. Check prometheus is scraping ingress metrics +3. Confirm prometheus-adapter configuration +4. Validate ingress annotations for metrics + +### Scaling Too Aggressive/Slow + +Adjust scaling behavior: + +```yaml +autoscaling: + behaviour: + scaleUp: + stabilizationWindowSeconds: 60 # Faster scaling up + policies: + - type: Percent + value: 100 + periodSeconds: 60 + scaleDown: + stabilizationWindowSeconds: 300 # Slower scaling down + policies: + - type: Percent + value: 25 # More conservative scale down + periodSeconds: 300 +``` + +## Best Practices + +1. **Set appropriate resource requests**: HPA needs resource requests to calculate CPU utilization +2. **Use stabilization windows**: Prevent thrashing with appropriate cooldown periods +3. **Monitor costs**: Autoscaling can increase costs rapidly +4. **Test thoroughly**: Validate scaling behavior under realistic load +5. **Set reasonable limits**: Use `maxReplicas` to prevent runaway scaling +6. **Use multiple metrics**: Combine CPU and request rate for better scaling decisions + +Example ingress configuration for load testing: + +```yaml +# For AWS ALB +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: eoapi-ingress + annotations: + kubernetes.io/ingress.class: alb + alb.ingress.kubernetes.io/scheme: internet-facing +spec: + ingressClassName: nginx + rules: + - host: your-domain.com + http: + paths: [...] + +# For nginx ingress +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: eoapi-ingress +spec: + ingressClassName: nginx + rules: + - host: abc5929f88f8c45c38f6cbab2faad43c-776419634.us-west-2.elb.amazonaws.com + http: + paths: [...] +``` ## Load Testing #### Load Testing with `hey` -Everything mentioned below assumes you've already gone through the autoscaling setup above and -that you're deploying using `ingress.className: "nginx"`. +The `hey` tool is a simple HTTP load testing tool. ### Install and Run Load Tests -1. Install `hey` utility locally: - +1. Install hey: ```bash # macOS brew install hey # Linux - wget https://github.com/rakyll/hey/releases/latest/download/hey_linux_amd64 - chmod +x hey_linux_amd64 && sudo mv hey_linux_amd64 /usr/local/bin/hey + go install github.com/rakyll/hey@latest - # Or use Docker - alias hey='docker run --rm rcmorano/hey' + # Or download from releases + wget https://hey-release.s3.us-east-2.amazonaws.com/hey_linux_amd64 + chmod +x hey_linux_amd64 + sudo mv hey_linux_amd64 /usr/local/bin/hey ``` -2. Find the external IP of your shared nginx ingress: - +2. 
Run basic load test: ```bash - # For GKE clusters - export INGRESS_ENDPOINT=$(kubectl -n ingress-nginx get ingress/nginx-service-ingress-shared-eoapi -o=jsonpath='{.spec.rules[0].host}') - # Example output: eoapi-35.234.254.12.nip.io + # Test STAC endpoint + hey -z 5m -c 10 https://your-domain.com/stac/collections - # For EKS clusters - export INGRESS_ENDPOINT=$(kubectl -n ingress-nginx get svc/ingress-nginx-controller -o=jsonpath='{.status.loadBalancer.ingress[0].hostname}') - # Example output: k8s-eoapi-ingressn-404721dbb4-e6dec70321c3eddd.elb.us-west-2.amazonaws.com + # Test with higher concurrency + hey -z 10m -c 50 https://your-domain.com/stac/search ``` -3. Run load tests against different endpoints in separate terminals: - +3. Monitor during load test: ```bash - # Test Vector API - hey -n 2000000 -q 150 -c 20 "http://${INGRESS_ENDPOINT}/vector/collections/public.my_data/items?f=geojson" - - # Test STAC API - hey -n 2000000 -q 150 -c 20 "http://${INGRESS_ENDPOINT}/stac/" + # Watch HPA scaling + watch kubectl get hpa -n eoapi - # Test Raster API - hey -n 2000000 -q 150 -c 20 "http://${INGRESS_ENDPOINT}/raster/collections" + # Monitor pods + watch kubectl get pods -n eoapi ``` - **Load testing parameters:** - - `-n`: Total number of requests (2M for sustained testing) - - `-q`: Rate limit (150 requests/second per worker) - - `-c`: Number of concurrent workers (20) - -4. **Monitor autoscaling in Grafana** - Go back to your Grafana dashboard and watch your services autoscale for the endpoints you're hitting: - - ![Grafana Autoscaling Dashboard](./images/grafanaautoscale.png) ### Load Testing Best Practices -- **Start small**: Begin with lower request rates and gradually increase -- **Monitor resources**: Watch CPU, memory, and request rate metrics -- **Test realistic scenarios**: Use actual data access patterns when possible -- **Verify autoscaling**: Ensure HPA triggers and pods scale up/down appropriately -- **Database bottlenecks**: Monitor PostgreSQL performance under load -- **Clean up**: Stop load tests gracefully to avoid overwhelming services +1. **Start small**: Begin with low concurrency and short duration +2. **Monitor resources**: Watch CPU, memory, and network usage +3. **Test realistic scenarios**: Use actual API endpoints and payloads +4. **Gradual increase**: Slowly increase load to find breaking points +5. 
**Test different endpoints**: Each service may have different characteristics ### Troubleshooting Load Tests -If autoscaling isn't triggering: -- Verify HPA is configured: `kubectl get hpa -n eoapi` -- Check custom metrics: `kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" | jq .` -- Ensure prometheus-adapter is running: `kubectl get pods -n eoapi-support` -- Validate ingress metrics: Check Grafana for nginx request rates +- **High response times**: May indicate need for more replicas or resources +- **Error rates**: Could suggest database bottlenecks or resource limits +- **No scaling**: Check HPA metrics and thresholds ### Advanced Load Testing -For more sophisticated testing consider: -- **[k6](https://k6.io/)** - JavaScript-based load testing with scenarios -- **[Artillery](https://artillery.io/)** - Node.js load testing toolkit -- **[JMeter](https://jmeter.apache.org/)** - GUI-based load testing with complex scenarios +For more comprehensive testing, consider: +- **[Artillery](https://artillery.io/)** - Feature-rich load testing toolkit +- **[k6](https://k6.io/)** - Developer-centric performance testing - **[Locust](https://locust.io/)** - Python-based distributed load testing + +For monitoring and observability setup, see [observability.md](observability.md). diff --git a/docs/observability.md b/docs/observability.md new file mode 100644 index 00000000..afc7b8a1 --- /dev/null +++ b/docs/observability.md @@ -0,0 +1,342 @@ +--- +title: "Observability" +description: "Metrics collection, monitoring, and visualization for eoAPI deployments." +external_links: + - name: "Prometheus Documentation" + url: "https://prometheus.io/docs/" + - name: "Grafana Documentation" + url: "https://grafana.com/docs/" +--- + + # Observability & Monitoring + +This guide covers metrics collection, monitoring, and visualization for eoAPI deployments. All monitoring components are optional and disabled by default. + +## Overview + +eoAPI observability is implemented through conditional dependencies in the main `eoapi` chart: + +### Core Monitoring +Essential metrics collection infrastructure including Prometheus server, metrics-server, kube-state-metrics, node-exporter, and prometheus-adapter. + +### Integrated Observability +Grafana dashboards and visualization tools are available as conditional dependencies within the main chart, eliminating the need for separate deployments. + +## Configuration + +**Prerequisites**: Kubernetes cluster with Helm 3 installed. 
+ +### Quick Deployment + +```bash +# Deploy with monitoring and observability enabled +helm install eoapi eoapi/eoapi \ + --set monitoring.prometheus.enabled=true \ + --set observability.grafana.enabled=true + +# Access Grafana (get password) +kubectl get secret eoapi-grafana -n eoapi \ + -o jsonpath="{.data.admin-password}" | base64 -d +``` + +### Using Configuration Files + +For production deployments, use configuration files instead of command-line flags: + +```bash +# Deploy with integrated monitoring and observability +helm install eoapi eoapi/eoapi -f values-full-observability.yaml +``` + +## Architecture & Components + +**Component Responsibilities:** + +- **Prometheus Server**: Central metrics storage and querying engine +- **metrics-server**: Provides resource metrics for `kubectl top` and HPA +- **kube-state-metrics**: Exposes Kubernetes object state as metrics +- **prometheus-node-exporter**: Collects hardware and OS metrics from nodes +- **prometheus-adapter**: Enables custom metrics for Horizontal Pod Autoscaler +- **Grafana**: Dashboards and visualization of collected metrics + +**Data Flow**: Exporters expose metrics → Prometheus scrapes and stores → Grafana/kubectl query via PromQL → Dashboards visualize data + +### Detailed Configuration + +#### Basic Monitoring Setup + +```yaml +# values.yaml - Enable core monitoring in main eoapi chart +monitoring: + metricsServer: + enabled: true + prometheus: + enabled: true + server: + persistentVolume: + enabled: true + size: 50Gi + retention: "30d" + kube-state-metrics: + enabled: true + prometheus-node-exporter: + enabled: true +``` + +#### Observability Chart Configuration + +```yaml +# Basic Grafana setup +grafana: + enabled: true + service: + type: LoadBalancer + +# Connect to external Prometheus (if not using eoapi's Prometheus) +prometheusUrl: "http://prometheus.monitoring.svc.cluster.local" + +# Production Grafana configuration +grafana: + persistence: + enabled: true + size: 10Gi + resources: + limits: + cpu: 200m + memory: 400Mi + requests: + cpu: 50m + memory: 200Mi +``` + +#### PostgreSQL Monitoring + +Enable PostgreSQL metrics collection: + +```yaml +postgrescluster: + monitoring: true # Enables postgres_exporter sidecar +``` + +## Available Metrics + +### Core Infrastructure Metrics +- **Container resources**: CPU, memory, network usage +- **Kubernetes state**: Pods, services, deployments status +- **Node metrics**: Hardware utilization, filesystem usage +- **PostgreSQL**: Database connections, query performance (when enabled) + +### Custom Application Metrics + +When prometheus-adapter and nginx ingress are both enabled, these custom metrics become available: +- `nginx_ingress_controller_requests_rate_stac_eoapi` +- `nginx_ingress_controller_requests_rate_raster_eoapi` +- `nginx_ingress_controller_requests_rate_vector_eoapi` +- `nginx_ingress_controller_requests_rate_multidim_eoapi` + +**Requirements**: +- nginx ingress controller with prometheus metrics enabled +- Ingress must use specific hostnames (not wildcard patterns) +- prometheus-adapter must be configured to expose these metrics + +## Pre-built Dashboards + +The `eoapi-observability` chart provides ready-to-use dashboards: + +### eoAPI Services Dashboard +- Request rates per service +- Response times and error rates +- Traffic patterns by endpoint + +### Infrastructure Dashboard +- CPU usage rate by pod +- CPU throttling metrics +- Memory usage and limits +- Pod count tracking + +### Container Resources Dashboard +- Resource consumption by container +- 
Resource quotas and limits +- Performance bottlenecks + +### PostgreSQL Dashboard (when enabled) +- Database connections +- Query performance +- Storage utilization + +#### Production Configuration + +```yaml +monitoring: + prometheus: + server: + # Persistent storage + persistentVolume: + enabled: true + size: 100Gi + storageClass: "gp3" + # Retention policy + retention: "30d" + # Resource allocation + resources: + limits: + cpu: "2000m" + memory: "4096Mi" + requests: + cpu: "1000m" + memory: "2048Mi" + # Security - internal access only + service: + type: ClusterIP +``` + +### Resource Requirements + +#### Core Monitoring Components + +Minimum resource requirements (actual usage varies by cluster size and metrics volume): + +| Component | CPU | Memory | Purpose | +|-----------|-----|---------|----------| +| prometheus-server | 500m | 1Gi | Metrics storage | +| metrics-server | 100m | 200Mi | Resource metrics | +| kube-state-metrics | 50m | 150Mi | K8s state | +| prometheus-node-exporter | 50m | 50Mi | Node metrics | +| prometheus-adapter | 100m | 128Mi | Custom metrics API | +| **Total** | **~800m** | **~1.5Gi** | | + +#### Observability Components + +| Component | CPU | Memory | Purpose | +|-----------|-----|---------|----------| +| grafana | 100m | 200Mi | Visualization | + +## Operations + +### Verification Commands + +```bash +# Check Prometheus is running +kubectl get pods -n eoapi -l app.kubernetes.io/name=prometheus + +# Verify metrics-server +kubectl get apiservice v1beta1.metrics.k8s.io + +# List available custom metrics +kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" | jq '.resources[].name' + +# Test metrics collection +kubectl port-forward svc/eoapi-prometheus-server 9090:80 -n eoapi +# Visit http://localhost:9090/targets +``` + +### Monitoring Health + +```bash +# Check Prometheus targets +curl -X GET 'http://localhost:9090/api/v1/query?query=up' + +# Verify Grafana datasource connectivity +kubectl exec -it deployment/eoapi-obs-grafana -n eoapi -- \ + wget -O- http://eoapi-prometheus-server/api/v1/label/__name__/values +``` + +## Advanced Features + +### Alerting Setup + +Enable alertmanager for alert management: + +```yaml +prometheus: + enabled: true + alertmanager: + enabled: true + config: + global: + # Configure with your SMTP server details + smtp_smarthost: 'your-smtp-server:587' + smtp_from: 'alertmanager@yourdomain.com' + route: + receiver: 'default-receiver' + receivers: + - name: 'default-receiver' + webhook_configs: + - url: 'http://your-webhook-endpoint:5001/' +``` + +**Note**: Replace example values with your actual SMTP server and webhook endpoints. + +### Batch Job Metrics + +Enable pushgateway for batch job metrics: + +```yaml +prometheus: + enabled: true + prometheus-pushgateway: + enabled: true # For batch job metrics collection +``` + +### Custom Dashboards + +Add custom dashboards by creating ConfigMaps with the appropriate label: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: custom-dashboard + namespace: eoapi + labels: + eoapi_dashboard: "1" +data: + custom.json: | + { + "dashboard": { + "id": null, + "title": "Custom eoAPI Dashboard", + "tags": ["eoapi"], + "panels": [] + } + } +``` + +The ConfigMap must be in the same namespace as the Grafana deployment and include the `eoapi_dashboard: "1"` label. + +## Troubleshooting + +### Common Issues + +**Missing Metrics** +1. 
Check Prometheus service discovery: + ```bash + kubectl port-forward svc/eoapi-prometheus-server 9090:80 -n eoapi + # Visit http://localhost:9090/service-discovery + ``` + +2. Verify target endpoints: + ```bash + kubectl get endpoints -n eoapi + ``` + +**Grafana Connection Issues** +1. Check datasource connectivity in Grafana UI → Configuration → Data Sources +2. Verify Prometheus URL accessibility from Grafana pod + +**Resource Issues** +- Monitor current usage: `kubectl top pods -n eoapi` +- Check for OOMKilled containers: `kubectl describe pods -n eoapi | grep -A 5 "Last State"` +- Verify resource limits are appropriate for your workload size +- Consider reducing Prometheus retention or increasing persistent volume size if storage is full + +## Security Considerations + +- **Network Security**: Use `ClusterIP` services for Prometheus in production +- **Access Control**: Configure network policies to restrict metrics access +- **Authentication**: Enable authentication for Grafana (LDAP, OAuth, etc.) +- **Data Privacy**: Consider metrics data sensitivity and retention policies + +## Related Documentation + +- For autoscaling configuration using these metrics: [autoscaling.md](autoscaling.md) diff --git a/mkdocs.yml b/mkdocs.yml index 9cbcf82d..2e262ee6 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -35,7 +35,8 @@ nav: - Azure AKS: azure.md - Operations: - Data Management: manage-data.md - - Autoscaling & Monitoring: autoscaling.md + - Autoscaling: autoscaling.md + - Observability: observability.md - Authentication: - STAC Auth Proxy: stac-auth-proxy.md - Contributing: