From 789b445740a6492fb6a48667e2befb0063921305 Mon Sep 17 00:00:00 2001
From: Stef Nestor <26751266+stefnestor@users.noreply.github.com>
Date: Fri, 9 Aug 2024 18:18:57 -0600
Subject: [PATCH 1/9] (Docs+) Flesh out Resource+Task troubleshooting

---
 docs/reference/esql/task-management.asciidoc  |  2 +-
 .../modules/indices/circuit_breaker.asciidoc  |  3 +-
 docs/reference/tab-widgets/cpu-usage.asciidoc | 49 +++++++++---------
 .../transform/troubleshooting.asciidoc        |  2 +-
 .../common-issues/high-cpu-usage.asciidoc     | 26 ++++++++--
 .../common-issues/hotspotting.asciidoc        |  2 +-
 .../common-issues/rejected-requests.asciidoc  | 50 ++++++++++++++++---
 .../common-issues/task-queue-backlog.asciidoc | 42 ++++++++++------
 8 files changed, 122 insertions(+), 54 deletions(-)

diff --git a/docs/reference/esql/task-management.asciidoc b/docs/reference/esql/task-management.asciidoc
index dfaff96123035..afb7506b0f09a 100644
--- a/docs/reference/esql/task-management.asciidoc
+++ b/docs/reference/esql/task-management.asciidoc
@@ -9,7 +9,7 @@ You can list running {esql} queries with the <>:
 
 [source,console,id=esql-task-management-get-all]
 ----
-GET /_tasks?pretty&detailed&group_by=parents&human&actions=*data/read/esql
+GET /_tasks?pretty=true&human=true&detailed=true&group_by=parents&actions=*data/read/esql
 ----
 
 Which returns a list of statuses like this:
diff --git a/docs/reference/modules/indices/circuit_breaker.asciidoc b/docs/reference/modules/indices/circuit_breaker.asciidoc
index a5a787e23d170..452d4e99704ce 100644
--- a/docs/reference/modules/indices/circuit_breaker.asciidoc
+++ b/docs/reference/modules/indices/circuit_breaker.asciidoc
@@ -175,7 +175,8 @@ an `OutOfMemory` exception which would bring down the node.
 To prevent this from happening, a special <> is used,
 which limits the memory allocation during the execution of a <>
 query. When the breaker is triggered, an `org.elasticsearch.common.breaker.CircuitBreakingException`
-is thrown and a descriptive error message is returned to the user.
+is thrown and a descriptive error message including `circuit_breaking_exception`
+is returned to the user.
 
 This <> can be configured using the following settings:
 
diff --git a/docs/reference/tab-widgets/cpu-usage.asciidoc b/docs/reference/tab-widgets/cpu-usage.asciidoc
index 575cf459ee5be..8ba1fbc60e81e 100644
--- a/docs/reference/tab-widgets/cpu-usage.asciidoc
+++ b/docs/reference/tab-widgets/cpu-usage.asciidoc
@@ -1,30 +1,29 @@
 // tag::cloud[]
-From your deployment menu, click **Performance**. The page's **CPU Usage** chart
-shows your deployment's CPU usage as a percentage.
-
-High CPU usage can also deplete your CPU credits. CPU credits let {ess} provide
-smaller clusters with a performance boost when needed. The **CPU credits**
-chart shows your remaining CPU credits, measured in seconds of CPU time.
-
-You can also use the <> to get the current CPU usage
-for each node.
-
-// tag::cpu-usage-cat-nodes[]
-[source,console]
-----
-GET _cat/nodes?v=true&s=cpu:desc
-----
-
-The response's `cpu` column contains the current CPU usage as a percentage. The
-`name` column contains the node's name.
-// end::cpu-usage-cat-nodes[]
-
+* (Recommended) Enabling {cloud}/ec-monitoring-setup.html[Logs and Metrics]. Data will then
+report under {kib}'s {kibana-ref}/xpack-monitoring.html[Stack Monitoring]. We
+recommend enabling its {kibana-ref}/kibana-alerts.html[CPU Usage Threshold Alert]
+to be proactively notified about potential issues.
+
+* From your deployment menu, clicking into
+{cloud}/ec-saas-metrics-accessing.html[**Performance**]. This page's **CPU
+Usage** chart shows your deployment's CPU usage as a percentage. The page's
+**CPU credits** chart shows your remaining CPU credits, measured in seconds of
+CPU time.
+
+{ess} grants {cloud}/ec-vcpu-boost-instance.html[CPU credits] per deployment
+to provide smaller clusters with performance boosts when needed. High CPU
+usage can deplete these credits which may lead to symptoms like:
+
+* {cloud}/ec-scenario_why_is_performance_degrading_over_time.html[Why is
+performance degrading over time?].
+
+* {cloud}/ec-scenario_why_are_my_cluster_response_times_suddenly_so_much_worse.html[Why
+are my cluster response times suddenly so much worse?]
 // end::cloud[]
 
 // tag::self-managed[]
-
-Use the <> to get the current CPU usage for each node.
-
-include::cpu-usage.asciidoc[tag=cpu-usage-cat-nodes]
-
+* Enabling <>. Data will then
+report under {kib}'s {kibana-ref}/xpack-monitoring.html[Stack Monitoring]. We
+recommend enabling its {kibana-ref}/kibana-alerts.html[CPU Usage Threshold Alert]
+to be proactively notified about potential issues.
 // end::self-managed[]
diff --git a/docs/reference/transform/troubleshooting.asciidoc b/docs/reference/transform/troubleshooting.asciidoc
index 24abed46048fb..e5109275bd62d 100644
--- a/docs/reference/transform/troubleshooting.asciidoc
+++ b/docs/reference/transform/troubleshooting.asciidoc
@@ -20,7 +20,7 @@ by your `transform_id`.
 information about the {transform} status and failures.
 * If the {transform} exists as a task, you can use the
 <> to gather task information. For example:
-`GET _tasks?actions=data_frame/transforms*&detailed`. Typically, the task exists
+`GET _tasks?actions=data_frame/transforms*&detailed=true`. Typically, the task exists
 when the {transform} is in a started or failed state.
 * The {es} logs from the node that was running the {transform} might also
 contain useful information. You can identify the node from the notification
diff --git a/docs/reference/troubleshooting/common-issues/high-cpu-usage.asciidoc b/docs/reference/troubleshooting/common-issues/high-cpu-usage.asciidoc
index 858683ef97a6d..11d6a39127d60 100644
--- a/docs/reference/troubleshooting/common-issues/high-cpu-usage.asciidoc
+++ b/docs/reference/troubleshooting/common-issues/high-cpu-usage.asciidoc
@@ -9,12 +9,30 @@ If a thread pool is depleted, {es} will <> related to the
 thread pool. For example, if the `search` thread pool is depleted, {es} will
 reject search requests until more threads are available.
 
+CPU degradation frequently occurs related to a <>'s traffic,
+potentially being <>.
+
 [discrete]
 [[diagnose-high-cpu-usage]]
 ==== Diagnose high CPU usage
 
 **Check CPU usage**
 
+Current CPU usage per node can be polled from the <>:
+
+// tag::cpu-usage-cat-nodes[]
+[source,console]
+----
+GET _cat/nodes?v=true&s=cpu:desc
+----
+
+The response's `cpu` column contains the current CPU usage as a percentage.
+The `name` column contains the node's name. Elevated but transient `cpu` is
+normal, but if `cpu` is elevated for an extended duration it should be
+investigated.
+
+To track CPU usage over time, we recommend enabling monitoring:
+
 include::{es-ref-dir}/tab-widgets/cpu-usage-widget.asciidoc[]
 
 **Check hot threads**
@@ -24,11 +42,13 @@ threads API>> to check for resource-intensive threads running on the node.
 [source,console]
 ----
-GET _nodes/my-node,my-other-node/hot_threads
+GET _nodes/hot_threads
 ----
 // TEST[s/\/my-node,my-other-node//]
 
-This API returns a breakdown of any hot threads in plain text.
+This API returns a breakdown of any hot threads in plain text. High CPU usage
+frequently correlates to <>.
 
 [discrete]
 [[reduce-cpu-usage]]
@@ -56,7 +76,7 @@ for these searches, use the <>.
 
 [source,console]
 ----
-GET _tasks?actions=*search&detailed
+GET _tasks?actions=*search&detailed=true
 ----
 
 The response's `description` contains the search request and its queries.
diff --git a/docs/reference/troubleshooting/common-issues/hotspotting.asciidoc b/docs/reference/troubleshooting/common-issues/hotspotting.asciidoc
index a8ca4c7d851d1..236359e9845d0 100644
--- a/docs/reference/troubleshooting/common-issues/hotspotting.asciidoc
+++ b/docs/reference/troubleshooting/common-issues/hotspotting.asciidoc
@@ -263,7 +263,7 @@ further insight on it via <>,
 
 [source,console]
 ----
-GET _tasks?human&detailed
+GET _tasks?pretty=true&human=true&detailed=true
 ----
 
 Its response contains a `description` that reports this query:
diff --git a/docs/reference/troubleshooting/common-issues/rejected-requests.asciidoc b/docs/reference/troubleshooting/common-issues/rejected-requests.asciidoc
index 497bddc562c69..e01420b0d5633 100644
--- a/docs/reference/troubleshooting/common-issues/rejected-requests.asciidoc
+++ b/docs/reference/troubleshooting/common-issues/rejected-requests.asciidoc
@@ -23,9 +23,50 @@ To check the number of rejected tasks for each thread pool, use the
 
 [source,console]
 ----
-GET /_cat/thread_pool?v=true&h=id,name,active,rejected,completed
+GET /_cat/thread_pool?v=true&h=id,name,queue,active,rejected,completed
 ----
 
+The `write` thread pool rejections frequently surface in the erring API and
+correlating log as `EsRejectedExecutionException` with either
+`QueueResizingEsThreadPoolExecutor` or `queue capacity`.
+
+This frequently relates to <>
+
+[discrete]
+[[check-circuit-breakers]]
+==== Check circuit breakers
+
+To check the number of tripped <>, use the
+<>.
+
+[source,console]
+----
+GET /_nodes/stats/breaker
+----
+
+These statistics are cumulative from node start up. For more information, see
+<>.
+
+[discrete]
+[[check-indexing-pressure]]
+==== Check indexing pressure
+
+To check the number of <>
+rejections, use the <>
+
+[source,console]
+----
+GET _nodes/stats?human=true&filter_path=nodes.*.indexing_pressure
+----
+
+The statistics are cumulative from node start up. Related API errors would
+include `EsRejectedExecutionException` sub sections calling out rejected due
+to `coordinating_and_primary_bytes`, `coordinating`, `primary`, or `replica`.
+
+This frequently relates to <>,
+<> sizing, and/or the ingest target's
+<>.
+
 [discrete]
 [[prevent-rejected-requests]]
 ==== Prevent rejected requests
@@ -34,9 +75,4 @@ GET /_cat/thread_pool?v=true&h=id,name,active,rejected,completed
 
 If {es} regularly rejects requests and other tasks, your cluster likely has
 high CPU usage or high JVM memory pressure. For tips, see <> and
-<>.
-
-**Prevent circuit breaker errors**
-
-If you regularly trigger circuit breaker errors, see <>
-for tips on diagnosing and preventing them.
\ No newline at end of file
+<>.
\ No newline at end of file
diff --git a/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc b/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
index 1ff5bf2e5c311..78b9bc6aebd25 100644
--- a/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
+++ b/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
@@ -1,10 +1,10 @@
 [[task-queue-backlog]]
 === Task queue backlog
 
-A backlogged task queue can prevent tasks from completing and
-put the cluster into an unhealthy state.
-Resource constraints, a large number of tasks being triggered at once,
-and long running tasks can all contribute to a backlogged task queue.
+A backlogged task queue can prevent tasks from completing and put the cluster
+into an unhealthy state. Resource constraints, a large number of tasks being
+triggered at once, and long running tasks can all contribute to a backlogged
+task queue.
 
 [discrete]
 [[diagnose-task-queue-backlog]]
@@ -12,23 +12,28 @@ and long running tasks can all contribute to a backlogged task queue.
 
 **Check the thread pool status**
 
-A <> can result in <>.
+A <> can result in
+<>. This may surface restricted to a
+<>'s traffic, potentially with <>
+symptoms.
 
-You can use the <> to
-see the number of active threads in each thread pool and
-how many tasks are queued, how many have been rejected, and how many have completed.
+You can use the <> to see the number of
+active threads in each thread pool and how many tasks are queued, how many
+have been rejected, and how many have completed.
 
 [source,console]
 ----
 GET /_cat/thread_pool?v&s=t,n&h=type,name,node_name,active,queue,rejected,completed
 ----
 
+The `active` and `queue` statistics are instantaneous while the `rejected` and
+`completed` statistics are cumulative from node start up.
+
 **Inspect the hot threads on each node**
 
-If a particular thread pool queue is backed up,
-you can periodically poll the <> API
-to determine if the thread has sufficient
-resources to progress and gauge how quickly it is progressing.
+If a particular thread pool queue is backed up, you can periodically poll the
+<> API to determine if the thread
+has sufficient resources to progress and gauge how quickly it is progressing.
 
 [source,console]
 ----
@@ -37,15 +42,22 @@ GET /_nodes/hot_threads
 ----
 
 **Look for long running tasks**
 
-Long-running tasks can also cause a backlog.
-You can use the <> API to get information about the tasks that are running.
-Check the `running_time_in_nanos` to identify tasks that are taking an excessive amount of time to complete.
+Long-running tasks can also cause a backlog. You can use the <>
+API to get information about the node tasks that are running.
+Check the `running_time_in_nanos` to identify tasks that are taking an
+excessive amount of time to complete.
 
 [source,console]
 ----
 GET /_tasks?filter_path=nodes.*.tasks
 ----
 
+Back up may also surface as a delay in synchronizing the cluster state. You
+can use the <> to get information
+about the pending cluster state sync tasks that are running. Check the
+`timeInQueue` to identify tasks that are taking an excessive amount of time
+to complete.
+
 [discrete]
 [[resolve-task-queue-backlog]]
 ==== Resolve a task queue backlog

From b38bcec70e76f8f5f1fe691c9fe4f56808d3f8e5 Mon Sep 17 00:00:00 2001
From: Stef Nestor <26751266+stefnestor@users.noreply.github.com>
Date: Fri, 9 Aug 2024 18:36:15 -0600
Subject: [PATCH 2/9] add taskQueueBacklog most common task commands

---
 .../common-issues/task-queue-backlog.asciidoc | 26 +++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc b/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
index 78b9bc6aebd25..1ec20e9822869 100644
--- a/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
+++ b/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
@@ -40,7 +40,7 @@ has sufficient resources to progress and gauge how quickly it is progressing.
 GET /_nodes/hot_threads
 ----
 
-**Look for long running tasks**
+**Look for long running node tasks**
 
 Long-running tasks can also cause a backlog. You can use the <>
 API to get information about the node tasks that are running.
@@ -49,9 +49,31 @@ excessive amount of time to complete.
 
 [source,console]
 ----
-GET /_tasks?filter_path=nodes.*.tasks
+GET /_tasks?pretty=true&human=true&detailed=true
 ----
 
+If a particular `action` is suspected, you can filter in further. Most common are:
+
+* <> related
++
+[source,console]
+----
+GET /_tasks?pretty=true&human=true&detailed=true&actions=indices:data/write/bulk
+----
+
+* search related
++
++
+[source,console]
+----
+GET /_tasks?pretty=true&human=true&detailed=true&actions=indices:data/read/search
+----
+
+Note the API response may contain tasks columns `description` and `header`
+which enable futher diagnosising task parameters, target, and requestor.
+
+**Look for long running cluster tasks**
+
 Back up may also surface as a delay in synchronizing the cluster state. You
 can use the <> to get information
 about the pending cluster state sync tasks that are running. Check the

From a3cd4c9b72c921c1707bc0671adc6927b7051603 Mon Sep 17 00:00:00 2001
From: Stef Nestor <26751266+stefnestor@users.noreply.github.com>
Date: Fri, 9 Aug 2024 18:41:54 -0600
Subject: [PATCH 3/9] typo

---
 .../common-issues/task-queue-backlog.asciidoc | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc b/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
index 1ec20e9822869..9728c914e4d4c 100644
--- a/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
+++ b/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
@@ -63,7 +63,6 @@ GET /_tasks?pretty=true&human=true&detailed=true&actions=indices:data/write/bulk
 ----
 
 * search related
 +
-+
 [source,console]
 ----
 GET /_tasks?pretty=true&human=true&detailed=true&actions=indices:data/read/search
 ----
@@ -76,9 +75,15 @@ which enable futher diagnosising task parameters, target, and requestor.
 
 Back up may also surface as a delay in synchronizing the cluster state. You
 can use the <> to get information
-about the pending cluster state sync tasks that are running. Check the
-`timeInQueue` to identify tasks that are taking an excessive amount of time
-to complete.
+about the pending cluster state sync tasks that are running.
+
+[source,console]
+----
+GET /_cat/pending_tasks?v=true
+----
+
+Check the `timeInQueue` to identify tasks that are taking an excessive amount
+of time to complete.
 
 [discrete]
 [[resolve-task-queue-backlog]]
 ==== Resolve a task queue backlog

From a793b09e0892eff4e1d34c5104eac1f8117d739e Mon Sep 17 00:00:00 2001
From: shainaraskas <58563081+shainaraskas@users.noreply.github.com>
Date: Mon, 12 Aug 2024 15:28:21 -0400
Subject: [PATCH 4/9] Update docs/reference/troubleshooting/common-issues/rejected-requests.asciidoc

---
 .../troubleshooting/common-issues/rejected-requests.asciidoc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/reference/troubleshooting/common-issues/rejected-requests.asciidoc b/docs/reference/troubleshooting/common-issues/rejected-requests.asciidoc
index e01420b0d5633..53bfcd7dba1e8 100644
--- a/docs/reference/troubleshooting/common-issues/rejected-requests.asciidoc
+++ b/docs/reference/troubleshooting/common-issues/rejected-requests.asciidoc
@@ -45,7 +45,7 @@ GET /_nodes/stats/breaker
 ----
 
 These statistics are cumulative from node start up. For more information, see
-<>.
+<>.
 
 [discrete]
 [[check-indexing-pressure]]

From 7fe92b41dc2d98e213a08fdc35b071cbb01efe82 Mon Sep 17 00:00:00 2001
From: Stef Nestor <26751266+stefnestor@users.noreply.github.com>
Date: Fri, 16 Aug 2024 15:07:21 -0600
Subject: [PATCH 5/9] feedback

Co-authored-by: shainaraskas <58563081+shainaraskas@users.noreply.github.com>
---
 docs/reference/tab-widgets/cpu-usage.asciidoc | 29 +++++++------------
 .../common-issues/high-cpu-usage.asciidoc     | 13 ++++-----
 .../common-issues/rejected-requests.asciidoc  | 18 +++++++-----
 .../common-issues/task-queue-backlog.asciidoc | 13 ++++-----
 4 files changed, 32 insertions(+), 41 deletions(-)

diff --git a/docs/reference/tab-widgets/cpu-usage.asciidoc b/docs/reference/tab-widgets/cpu-usage.asciidoc
index 8ba1fbc60e81e..c6272228965eb 100644
--- a/docs/reference/tab-widgets/cpu-usage.asciidoc
+++ b/docs/reference/tab-widgets/cpu-usage.asciidoc
@@ -1,29 +1,20 @@
 // tag::cloud[]
-* (Recommended) Enabling {cloud}/ec-monitoring-setup.html[Logs and Metrics]. Data will then
-report under {kib}'s {kibana-ref}/xpack-monitoring.html[Stack Monitoring]. We
-recommend enabling its {kibana-ref}/kibana-alerts.html[CPU Usage Threshold Alert]
-to be proactively notified about potential issues.
+* (Recommended) Enable {cloud}/ec-monitoring-setup.html[logs and metrics]. When logs and metrics are enabled, monitoring information is visible on {kib}'s {kibana-ref}/xpack-monitoring.html[Stack Monitoring] page.
++
+You can also enable the {kibana-ref}/kibana-alerts.html[CPU usage threshold alert] to be notified about potential issues through email.
 
-* From your deployment menu, clicking into
-{cloud}/ec-saas-metrics-accessing.html[**Performance**]. This page's **CPU
-Usage** chart shows your deployment's CPU usage as a percentage. The page's
-**CPU credits** chart shows your remaining CPU credits, measured in seconds of
-CPU time.
+* From your deployment menu, view the {cloud}/ec-saas-metrics-accessing.html[**Performance**] page. On this page, you can view two key metrics:
+** **CPU usage**: Your deployment's CPU usage, represented as a percentage.
+** **CPU credits**: Your remaining CPU credits, measured in seconds of CPU time.
 
 {ess} grants {cloud}/ec-vcpu-boost-instance.html[CPU credits] per deployment
 to provide smaller clusters with performance boosts when needed.
High CPU
-usage can deplete these credits which may lead to symptoms like:
+usage can deplete these credits, which might lead to {cloud}/ec-scenario_why_is_performance_degrading_over_time.html[performance degradation] and {cloud}/ec-scenario_why_are_my_cluster_response_times_suddenly_so_much_worse.html[increased cluster response times].
 
-* {cloud}/ec-scenario_why_is_performance_degrading_over_time.html[Why is
-performance degrading over time?].
-
-* {cloud}/ec-scenario_why_are_my_cluster_response_times_suddenly_so_much_worse.html[Why
-are my cluster response times suddenly so much worse?]
 // end::cloud[]
 
 // tag::self-managed[]
-* Enabling <>. Data will then
-report under {kib}'s {kibana-ref}/xpack-monitoring.html[Stack Monitoring]. We
-recommend enabling its {kibana-ref}/kibana-alerts.html[CPU Usage Threshold Alert]
-to be proactively notified about potential issues.
+* Enable <>. When logs and metrics are enabled, monitoring information is visible on {kib}'s {kibana-ref}/xpack-monitoring.html[Stack Monitoring] page.
++
+You can also enable the {kibana-ref}/kibana-alerts.html[CPU usage threshold alert] to be notified about potential issues through email.
 // end::self-managed[]
diff --git a/docs/reference/troubleshooting/common-issues/high-cpu-usage.asciidoc b/docs/reference/troubleshooting/common-issues/high-cpu-usage.asciidoc
index 11d6a39127d60..5b7a68e27a9a5 100644
--- a/docs/reference/troubleshooting/common-issues/high-cpu-usage.asciidoc
+++ b/docs/reference/troubleshooting/common-issues/high-cpu-usage.asciidoc
@@ -9,8 +9,7 @@ If a thread pool is depleted, {es} will <> related to the
 thread pool. For example, if the `search` thread pool is depleted, {es} will
 reject search requests until more threads are available.
 
-CPU degradation frequently occurs related to a <>'s traffic,
-potentially being <>.
+You might experience high CPU usage if a <>, and therefore the nodes assigned to that tier, is experiencing more traffic than other tiers. This imbalance in resource utilization is also known as <>.
 
 [discrete]
 [[diagnose-high-cpu-usage]]
@@ -18,7 +17,7 @@ potentially being <>.
 
 **Check CPU usage**
 
-Current CPU usage per node can be polled from the <>:
+You can check the CPU usage per node using the <>:
 
 // tag::cpu-usage-cat-nodes[]
 [source,console]
@@ -27,8 +26,8 @@ GET _cat/nodes?v=true&s=cpu:desc
 ----
 
 The response's `cpu` column contains the current CPU usage as a percentage.
-The `name` column contains the node's name. Elevated but transient `cpu` is
-normal, but if `cpu` is elevated for an extended duration it should be
+The `name` column contains the node's name. Elevated but transient CPU usage is
+normal. However, if CPU usage is elevated for an extended duration, it should be
 investigated.
 
 To track CPU usage over time, we recommend enabling monitoring:
@@ -47,8 +46,8 @@ GET _nodes/hot_threads
 ----
 // TEST[s/\/my-node,my-other-node//]
 
 This API returns a breakdown of any hot threads in plain text. High CPU usage
-frequently correlates to <>.
+frequently correlates to <>.
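+
+For illustration only, a busy node's truncated response might resemble the
+following (the node name, timestamp, and percentages below are invented for
+this example):
+
+[source,txt]
+----
+::: {my-node}
+   Hot threads at 2024-08-16T21:05:23Z, interval=500ms, busiestThreads=3,
+   ignoreIdleThreads=true:
+
+   92.5% [cpu=92.5%, other=0.0%] (462.5ms out of 500ms) cpu usage by thread
+   'elasticsearch[my-node][search][T#3]'
+----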
 
 [discrete]
 [[reduce-cpu-usage]]
 ==== Reduce CPU usage
diff --git a/docs/reference/troubleshooting/common-issues/rejected-requests.asciidoc b/docs/reference/troubleshooting/common-issues/rejected-requests.asciidoc
index 53bfcd7dba1e8..036eff25ddbae 100644
--- a/docs/reference/troubleshooting/common-issues/rejected-requests.asciidoc
+++ b/docs/reference/troubleshooting/common-issues/rejected-requests.asciidoc
@@ -26,11 +26,11 @@ To check the number of rejected tasks for each thread pool, use the
 GET /_cat/thread_pool?v=true&h=id,name,queue,active,rejected,completed
 ----
 
-The `write` thread pool rejections frequently surface in the erring API and
+`write` thread pool rejections frequently appear in the erring API and
 correlating log as `EsRejectedExecutionException` with either
 `QueueResizingEsThreadPoolExecutor` or `queue capacity`.
 
-This frequently relates to <>
+These errors are often related to <>.
 
 [discrete]
 [[check-circuit-breakers]]
@@ -44,7 +44,7 @@ To check the number of tripped <>, use the
 GET /_nodes/stats/breaker
 ----
 
-These statistics are cumulative from node start up. For more information, see
+These statistics are cumulative from node startup. For more information, see
 <>.
 
 [discrete]
@@ -52,19 +52,21 @@ These statistics are cumulative from node start up. For more information, see
 ==== Check indexing pressure
 
 To check the number of <>
-rejections, use the <>
+rejections, use the <>.
 
 [source,console]
 ----
 GET _nodes/stats?human=true&filter_path=nodes.*.indexing_pressure
 ----
 
-The statistics are cumulative from node start up. Related API errors would
-include `EsRejectedExecutionException` sub sections calling out rejected due
-to `coordinating_and_primary_bytes`, `coordinating`, `primary`, or `replica`.
+These stats are cumulative from node startup.
+
+Indexing pressure rejections appear as an
+`EsRejectedExecutionException`, and indicate that they were rejected due
+to `coordinating_and_primary_bytes`, `coordinating`, `primary`, or `replica`.
 
-This frequently relates to <>,
-<> sizing, and/or the ingest target's
+These errors are often related to <>,
+<> sizing, or the ingest target's
 <>.
diff --git a/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc b/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
index 9728c914e4d4c..e9b6ad1620401 100644
--- a/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
+++ b/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
@@ -27,7 +27,7 @@ GET /_cat/thread_pool?v&s=t,n&h=type,name,node_name,active,queue,rejected,comple
 ----
 
 The `active` and `queue` statistics are instantaneous while the `rejected` and
-`completed` statistics are cumulative from node start up.
+`completed` statistics are cumulative from node startup.
 
 **Inspect the hot threads on each node**
 
@@ -52,28 +52,27 @@ excessive amount of time to complete.
 GET /_tasks?pretty=true&human=true&detailed=true
 ----
 
-If a particular `action` is suspected, you can filter in further. Most common are:
+If a particular `action` is suspected, you can filter the tasks further. The most common long-running tasks are <>- or search-related.
 
-* <> related
+* Filter for <> actions:
 +
 [source,console]
 ----
 GET /_tasks?pretty=true&human=true&detailed=true&actions=indices:data/write/bulk
 ----
 
-* search related
+* Filter for search actions:
 +
 [source,console]
 ----
 GET /_tasks?pretty=true&human=true&detailed=true&actions=indices:data/read/search
 ----
 
-Note the API response may contain tasks columns `description` and `header`
-which enable futher diagnosising task parameters, target, and requestor.
+The API response may contain additional task columns, including `description` and `headers`, which provide the task parameters, target, and requestor. You can use this information to perform further diagnosis.
 
 **Look for long running cluster tasks**
 
-Back up may also surface as a delay in synchronizing the cluster state. You
+A task backlog might also appear as a delay in synchronizing the cluster state. You
 can use the <> to get information
 about the pending cluster state sync tasks that are running.

From 02befb33107a93748c0e375b10eb1095221ac31a Mon Sep 17 00:00:00 2001
From: Stef Nestor <26751266+stefnestor@users.noreply.github.com>
Date: Fri, 16 Aug 2024 15:10:12 -0600
Subject: [PATCH 6/9] feedback

---
 docs/reference/esql/task-management.asciidoc                  | 2 +-
 docs/reference/transform/troubleshooting.asciidoc             | 2 +-
 .../troubleshooting/common-issues/high-cpu-usage.asciidoc     | 2 +-
 .../troubleshooting/common-issues/hotspotting.asciidoc        | 2 +-
 .../troubleshooting/common-issues/task-queue-backlog.asciidoc | 4 ++--
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/reference/esql/task-management.asciidoc b/docs/reference/esql/task-management.asciidoc
index afb7506b0f09a..dfaff96123035 100644
--- a/docs/reference/esql/task-management.asciidoc
+++ b/docs/reference/esql/task-management.asciidoc
@@ -9,7 +9,7 @@ You can list running {esql} queries with the <>:
 
 [source,console,id=esql-task-management-get-all]
 ----
-GET /_tasks?pretty=true&human=true&detailed=true&group_by=parents&actions=*data/read/esql
+GET /_tasks?pretty&detailed&group_by=parents&human&actions=*data/read/esql
 ----
 
 Which returns a list of statuses like this:
diff --git a/docs/reference/transform/troubleshooting.asciidoc b/docs/reference/transform/troubleshooting.asciidoc
index e5109275bd62d..24abed46048fb 100644
--- a/docs/reference/transform/troubleshooting.asciidoc
+++ b/docs/reference/transform/troubleshooting.asciidoc
@@ -20,7 +20,7 @@ by your `transform_id`.
 information about the {transform} status and failures.
 * If the {transform} exists as a task, you can use the
 <> to gather task information. For example:
-`GET _tasks?actions=data_frame/transforms*&detailed=true`. Typically, the task exists
+`GET _tasks?actions=data_frame/transforms*&detailed`. Typically, the task exists
 when the {transform} is in a started or failed state.
 * The {es} logs from the node that was running the {transform} might also
 contain useful information. You can identify the node from the notification
diff --git a/docs/reference/troubleshooting/common-issues/high-cpu-usage.asciidoc b/docs/reference/troubleshooting/common-issues/high-cpu-usage.asciidoc
index 5b7a68e27a9a5..96a9a8f1e32b7 100644
--- a/docs/reference/troubleshooting/common-issues/high-cpu-usage.asciidoc
+++ b/docs/reference/troubleshooting/common-issues/high-cpu-usage.asciidoc
@@ -75,7 +75,7 @@ for these searches, use the <>.
 
 [source,console]
 ----
-GET _tasks?actions=*search&detailed=true
+GET _tasks?actions=*search&detailed
 ----
 
 The response's `description` contains the search request and its queries.
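+
+As an illustration, a single task's `description` might resemble the
+following invented value:
+
+[source,txt]
+----
+indices[my-index-000001], search_type[QUERY_THEN_FETCH],
+source[{"query":{"match_all":{}}}]
+----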
diff --git a/docs/reference/troubleshooting/common-issues/hotspotting.asciidoc b/docs/reference/troubleshooting/common-issues/hotspotting.asciidoc
index 236359e9845d0..a5c6b1bd5fbca 100644
--- a/docs/reference/troubleshooting/common-issues/hotspotting.asciidoc
+++ b/docs/reference/troubleshooting/common-issues/hotspotting.asciidoc
@@ -263,7 +263,7 @@ further insight on it via <>,
 
 [source,console]
 ----
-GET _tasks?pretty=true&human=true&detailed=true
+GET _tasks?human&detailed&pretty
 ----
 
 Its response contains a `description` that reports this query:
diff --git a/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc b/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
index e9b6ad1620401..5a4d3a4e57d5e 100644
--- a/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
+++ b/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
@@ -73,12 +73,12 @@ The API response may contain additional task columns, including `description` a
 
 **Look for long running cluster tasks**
 
 A task backlog might also appear as a delay in synchronizing the cluster state. You
-can use the <> to get information
+can use the <> to get information
 about the pending cluster state sync tasks that are running.
 
 [source,console]
 ----
-GET /_cat/pending_tasks?v=true
+GET /_cluster/pending_tasks?detailed
 ----
 
 Check the `timeInQueue` to identify tasks that are taking an excessive amount

From 5925b61f8ee55da067f7b8a0d4c9f96543730c92 Mon Sep 17 00:00:00 2001
From: Stef Nestor <26751266+stefnestor@users.noreply.github.com>
Date: Tue, 20 Aug 2024 08:36:22 -0600
Subject: [PATCH 7/9] feedback

---
 .../common-issues/task-queue-backlog.asciidoc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc b/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
index 5a4d3a4e57d5e..b30e475b7c151 100644
--- a/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
+++ b/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
@@ -78,7 +78,7 @@ about the pending cluster state sync tasks that are running.
 
 [source,console]
 ----
-GET /_cluster/pending_tasks?detailed
+GET /_cluster/pending_tasks
 ----
 
 Check the `timeInQueue` to identify tasks that are taking an excessive amount

From 9fd490b52991c3b4b0cd6b215aa7867b6b49cecf Mon Sep 17 00:00:00 2001
From: Stef Nestor <26751266+stefnestor@users.noreply.github.com>
Date: Tue, 20 Aug 2024 08:37:24 -0600
Subject: [PATCH 8/9] feedback

Co-authored-by: shainaraskas <58563081+shainaraskas@users.noreply.github.com>
---
 .../common-issues/task-queue-backlog.asciidoc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc b/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
index b30e475b7c151..d570456b5de67 100644
--- a/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
+++ b/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
@@ -13,9 +13,9 @@ task queue.
 **Check the thread pool status**
 
 A <> can result in
-<>. This may surface restricted to a
-<>'s traffic, potentially with <>
-symptoms.
+<>.
+
+Thread pool depletion might be restricted to a specific <>. If <> is occurring, one node might experience depletion faster than other nodes, leading to performance issues and a growing task backlog.
 You can use the <> to see the number of
 active threads in each thread pool and how many tasks are queued, how many
 have been rejected, and how many have completed.

From 439a0fa9c8d056e98d4e3b920813bf1dd230a209 Mon Sep 17 00:00:00 2001
From: Stef Nestor <26751266+stefnestor@users.noreply.github.com>
Date: Wed, 11 Sep 2024 15:28:45 -0600
Subject: [PATCH 9/9] feedback

Co-authored-by: David Turner
---
 .../troubleshooting/common-issues/hotspotting.asciidoc        | 2 +-
 .../troubleshooting/common-issues/rejected-requests.asciidoc  | 2 +-
 .../troubleshooting/common-issues/task-queue-backlog.asciidoc | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/reference/troubleshooting/common-issues/hotspotting.asciidoc b/docs/reference/troubleshooting/common-issues/hotspotting.asciidoc
index a5c6b1bd5fbca..a8ca4c7d851d1 100644
--- a/docs/reference/troubleshooting/common-issues/hotspotting.asciidoc
+++ b/docs/reference/troubleshooting/common-issues/hotspotting.asciidoc
@@ -263,7 +263,7 @@ further insight on it via <>,
 
 [source,console]
 ----
-GET _tasks?human&detailed&pretty
+GET _tasks?human&detailed
 ----
 
 Its response contains a `description` that reports this query:
diff --git a/docs/reference/troubleshooting/common-issues/rejected-requests.asciidoc b/docs/reference/troubleshooting/common-issues/rejected-requests.asciidoc
index 036eff25ddbae..c863709775fcd 100644
--- a/docs/reference/troubleshooting/common-issues/rejected-requests.asciidoc
+++ b/docs/reference/troubleshooting/common-issues/rejected-requests.asciidoc
@@ -56,7 +56,7 @@ rejections, use the <>.
 
 [source,console]
 ----
-GET _nodes/stats?human=true&filter_path=nodes.*.indexing_pressure
+GET _nodes/stats?human&filter_path=nodes.*.indexing_pressure
 ----
 
 These stats are cumulative from node startup.
diff --git a/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc b/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
index d570456b5de67..5aa6a0129c2d4 100644
--- a/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
+++ b/docs/reference/troubleshooting/common-issues/task-queue-backlog.asciidoc
@@ -58,14 +58,14 @@ If a particular `action` is suspected, you can filter the tasks further. The mos
 +
 [source,console]
 ----
-GET /_tasks?pretty=true&human=true&detailed=true&actions=indices:data/write/bulk
+GET /_tasks?human&detailed&actions=indices:data/write/bulk
 ----
 
 * Filter for search actions:
 +
 [source,console]
 ----
-GET /_tasks?pretty=true&human=true&detailed=true&actions=indices:data/read/search
+GET /_tasks?human&detailed&actions=indices:data/read/search
 ----
 
 The API response may contain additional task columns, including `description` and `headers`, which provide the task parameters, target, and requestor. You can use this information to perform further diagnosis.
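+
+For example, a filtered response entry might resemble the following truncated,
+illustrative snippet (the node ID, task ID, and timings are invented):
+
+[source,console-result]
+----
+{
+  "nodes": {
+    "oTUltX4IQMOUUVeiohTt8A": {
+      "tasks": {
+        "oTUltX4IQMOUUVeiohTt8A:464": {
+          "action": "indices:data/read/search",
+          "description": "indices[my-index-000001], search_type[QUERY_THEN_FETCH], source[{...}]",
+          "start_time_in_millis": 1725000000000,
+          "running_time_in_nanos": 13991383,
+          "cancellable": true,
+          "headers": {}
+        }
+      }
+    }
+  }
+}
+----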