Skip to content

Prometheus Cheatsheet – Kubernetes / OpenShift (WR & Produzione)

🧠 Prometheus Cheatsheet – Kubernetes / OpenShift (WR & Produzione)

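Cheatsheet operativo per troubleshooting rapido in produzione (WR / incident). Le query applicative (sezioni da CPU a Stato generale) escludono i namespace di sistema (kube-*, openshift-*); le query della sezione Capacity lavorano a livello di nodo/cluster e includono tutti i namespace.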


🎯 Filtro standard namespace applicativi

namespace!~"kube-.*|openshift-.*"

🔥 CPU

CPU usage per pod

sum by (namespace,pod) (
rate(container_cpu_usage_seconds_total{
namespace!~"kube-.*|openshift-.*",
container!="",container!="POD"
}[5m])
)

CPU usage vs limit (rischio throttling)

(
  sum by (namespace,pod)(
    rate(container_cpu_usage_seconds_total{
      namespace!~"kube-.*|openshift-.*",
      container!="",container!="POD"
    }[5m])
  )
)  # numerator: cores used per pod; pod-level and pause cgroup series are excluded so usage is not counted twice
/
(
  sum by (namespace,pod)(
    container_spec_cpu_quota{
      namespace!~"kube-.*|openshift-.*",
      container!="",container!="POD"
    }
    / container_spec_cpu_period
  )
)  # denominator: quota/period = CPU limit in cores, summed over the same container set as the numerator

CPU throttling (evento certo)

rate(container_cpu_cfs_throttled_seconds_total{
namespace!~"kube-.*|openshift-.*"
}[5m])

🧠 MEMORIA

Memory usage per pod

sum by (namespace,pod) (
  container_memory_working_set_bytes{
    namespace!~"kube-.*|openshift-.*",
    container!="",container!="POD"
  }
)  # aggregate the containers so the result is one series per pod, matching the heading and the CPU-per-pod query

Memory usage vs limit (rischio OOM)

container_memory_working_set_bytes{
  namespace!~"kube-.*|openshift-.*",
  container!="",container!="POD"
}
/
(
  container_spec_memory_limit_bytes{
    namespace!~"kube-.*|openshift-.*",
    container!="",container!="POD"
  } > 0
)  # containers without a memory limit export 0; dropping them avoids +Inf ratios in the result

OOMKilled – stato attuale

kube_pod_container_status_last_terminated_reason{
  namespace!~"kube-.*|openshift-.*",
  reason="OOMKilled"
} == 1  # some kube-state-metrics versions export every reason with value 0/1; keep only containers whose last termination really was OOMKilled

OOMKilled – storico (24h)

(
  increase(
    kube_pod_container_status_restarts_total{
      namespace!~"kube-.*|openshift-.*"
    }[24h]
  ) > 0
)  # value = restarts in the last 24h (restarts_total is a counter, so increase() is valid here)
and on (namespace,pod,container)
(
  max_over_time(
    kube_pod_container_status_last_terminated_reason{
      namespace!~"kube-.*|openshift-.*",
      reason="OOMKilled"
    }[24h]
  ) == 1
)  # last_terminated_reason is a 0/1 gauge, so it is used only as a filter: containers seen as OOMKilled during the window

🔁 RESTART / CRASHLOOP

Restart per pod

increase(
kube_pod_container_status_restarts_total{
namespace!~"kube-.*|openshift-.*"
}[15m]
)

🌐 RETE

Traffico OUT per pod

sum by (namespace,pod)(
rate(container_network_transmit_bytes_total{
namespace!~"kube-.*|openshift-.*"
}[5m])
)

Traffico IN per pod

sum by (namespace,pod)(
rate(container_network_receive_bytes_total{
namespace!~"kube-.*|openshift-.*"
}[5m])
)

Spike di rete (batch / sync)

max_over_time(
rate(container_network_transmit_bytes_total{
namespace!~"kube-.*|openshift-.*"
}[1m])[1h:]
)

💾 STORAGE / FILESYSTEM

Scritture filesystem per pod

rate(container_fs_writes_bytes_total{
namespace!~"kube-.*|openshift-.*"
}[5m])

Letture filesystem per pod

rate(container_fs_reads_bytes_total{
namespace!~"kube-.*|openshift-.*"
}[5m])

I/O elevato (job / DB / export)

rate(container_fs_writes_bytes_total{
namespace!~"kube-.*|openshift-.*"
}[1m])
+
rate(container_fs_reads_bytes_total{
namespace!~"kube-.*|openshift-.*"
}[1m])

PVC capacity vs usage

kubelet_volume_stats_used_bytes{
namespace!~"kube-.*|openshift-.*"
}
/
kubelet_volume_stats_capacity_bytes{
namespace!~"kube-.*|openshift-.*"
}

PVC quasi full (>80%)

(
kubelet_volume_stats_used_bytes{
namespace!~"kube-.*|openshift-.*"
}
/
kubelet_volume_stats_capacity_bytes{
namespace!~"kube-.*|openshift-.*"
}
) > 0.8

🕒 JOB / CRONJOB

Job completati

kube_job_status_succeeded{
  namespace!~"kube-.*|openshift-.*"
} > 0  # the gauge counts succeeded pods and is 0 for jobs that have not completed; keep only real completions

Job falliti

kube_job_status_failed{
  namespace!~"kube-.*|openshift-.*"
} > 0  # the gauge counts failed pods and is 0 for healthy jobs; without the filter every job would be listed as failed

Pod short-lived (firma job)

count_over_time(
kube_pod_container_status_running{
namespace!~"kube-.*|openshift-.*"
}[15m]
) < 15

📊 STATO GENERALE

Pod non Ready

kube_pod_status_ready{
  namespace!~"kube-.*|openshift-.*",
  condition="false"
} == 1  # each pod exports true/false/unknown series with value 0 or 1; only the series equal to 1 is the active condition

Numero pod per namespace

count by (namespace)(
kube_pod_info{
namespace!~"kube-.*|openshift-.*"
}
)

📊 CAPACITY

Utilizzo reale di CPU sui nodi (media mobile a 5 minuti, escluso idle)

cluster:node_cpu:ratio_rate5m{cluster=""}

Percentuale di CPU richieste dai pod rispetto alle CPU allocabili del cluster

sum(namespace_cpu:kube_pod_container_resource_requests:sum{cluster=""})
/ sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu",cluster=""})

Percentuale di CPU limits impostati dai pod rispetto alle CPU allocabili del cluster

sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=""})
/ sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu",cluster=""})
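
Percentuale di memoria disponibile sui nodi rispetto alla memoria totale del cluster

sum(:node_memory_MemAvailable_bytes:sum{cluster=""}) / sum(node_memory_MemTotal_bytes{job="node-exporter",cluster=""})

Percentuale di memoria richiesta dai pod rispetto alla memoria allocabile del cluster

sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=""}) / sum(kube_node_status_allocatable{job="kube-state-metrics",resource="memory",cluster=""})

Percentuale di memory limits impostati dai pod rispetto alla memoria allocabile del cluster

sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=""}) / sum(kube_node_status_allocatable{job="kube-state-metrics",resource="memory",cluster=""})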

CPU Usage per nodo worker %

100 *
max by (nodename) (
(
1 -
avg by (instance) (
rate(node_cpu_seconds_total{mode="idle"}[5m])
)
)
* on(instance) group_left(nodename)
node_uname_info
* on(nodename) group_left()
label_replace(
kube_node_role{role="worker"},
"nodename",
"$1",
"node",
"(.*)"
)
)

CPU Usage per nodo worker (core)

max by (nodename) (
(
sum by (instance) (
rate(node_cpu_seconds_total{mode!~"idle|iowait|steal"}[5m])
)
)
* on(instance) group_left(nodename)
node_uname_info
* on(nodename) group_left()
label_replace(
kube_node_role{role="worker"},
"nodename",
"$1",
"node",
"(.*)"
)
)

CPU Requests per nodo worker

100 *
sum by (node) (
kube_pod_container_resource_requests{resource="cpu", unit="core"}
* on(namespace, pod) group_left(node)
kube_pod_info{node=~".*worker.*"}
)
/
sum by (node) (
kube_node_status_allocatable{resource="cpu", unit="core", node=~".*worker.*"}
)

CPU Limits per nodo worker

100 * sum(kube_pod_container_resource_limits{resource="cpu", unit="core", node=~".*worker.*"}) by (node)
/ sum(kube_node_status_allocatable{resource="cpu", unit="core", node=~".*worker.*"}) by (node)  # worker-only filter on both sides, same node-name convention as the other per-worker queries

RAM usata per nodo worker %

100 *
max by (nodename) (
instance:node_memory_utilisation:ratio
* on(instance) group_left(nodename)
node_uname_info
* on(nodename) group_left()
label_replace(
kube_node_role{role="worker"},
"nodename",
"$1",
"node",
"(.*)"
)
)

RAM usata per nodo worker (GiB)

max by (nodename) (
(
(
node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes
)
* on(instance) group_left(nodename)
node_uname_info
* on(nodename) group_left()
label_replace(
kube_node_role{role="worker"},
"nodename",
"$1",
"node",
"(.*)"
)
)
)
/
1024 / 1024 / 1024

RAM Requests per nodo worker

100 * sum(kube_pod_container_resource_requests{resource="memory", unit="byte"}) by (node)
/ sum(kube_node_status_allocatable{resource="memory", unit="byte"}) by (node)
and on(node) kube_node_status_allocatable{node=~".*worker.*"}

RAM Limits per nodo worker

100 * sum(kube_pod_container_resource_limits{resource="memory", unit="byte"}) by (node)
/ sum(kube_node_status_allocatable{resource="memory", unit="byte"}) by (node)
and on(node) kube_node_status_allocatable{node=~".*worker.*"}

RAM disponibile per scheduling (requests)

(
sum(kube_node_status_allocatable{resource="memory", unit="byte", node=~".*worker.*"}) by (node)
-
sum(kube_pod_container_resource_requests{resource="memory", unit="byte", node=~".*worker.*"}) by (node)
) / 1024 / 1024 / 1024

CPU disponibile per scheduling (requests)

(
sum(kube_node_status_allocatable{resource="cpu", unit="core", node=~".*worker.*"}) by (node)
-
sum(kube_pod_container_resource_requests{resource="cpu", unit="core", node=~".*worker.*"}) by (node)
)

% AVERAGE/MAX Memory (ultimi 30 giorni):

avg_over_time(instance:node_memory_utilisation:ratio{job="node-exporter"}[30d])*100
max_over_time(instance:node_memory_utilisation:ratio{job="node-exporter"}[30d])*100

% AVERAGE/MAX CPU (ultimi 30 giorni):

avg_over_time(instance:node_cpu_utilisation:rate1m{job="node-exporter"} [30d]) * 100
max_over_time(instance:node_cpu_utilisation:rate1m{job="node-exporter"} [30d]) * 100

% RAM DISPONIBILE

(node_memory_MemAvailable_bytes{instance=~".*worker.*"} / node_memory_MemTotal_bytes{instance=~".*worker.*"}) * 100

% CPU DISPONIBILE

100 * (
sum by (instance) (rate(node_cpu_seconds_total{mode="idle", instance=~".*worker.*"}[5m]))
/
sum by (instance) (rate(node_cpu_seconds_total{instance=~".*worker.*"}[5m]))
)

CPU REQUEST TOTALE NODI WORKER

sum(
kube_pod_container_resource_requests{resource="cpu", unit="core"}
* on(node) group_left(role)
kube_node_role{role="worker"}
)

RAM REQUEST TOTALE NODI WORKER

sum(
kube_pod_container_resource_requests{resource="memory", unit="byte"}
* on(node) group_left(role)
kube_node_role{role="worker"}
) / 1024^3

CPU USATA TOTALE NODI WORKER

sum(
rate(container_cpu_usage_seconds_total{container!="", container!="POD", pod!=""}[5m])
* on(namespace, pod) group_left(node)
max by(namespace, pod, node) (
kube_pod_info
)
* on(node) group_left()
max by(node) (
kube_node_role{role="worker"}
)
)

RAM USATA TOTALE NODI WORKER

sum(
container_memory_working_set_bytes{container!="", container!="POD", pod!=""}
* on(namespace, pod) group_left(node)
max by(namespace, pod, node) (
kube_pod_info
)
* on(node) group_left()
max by(node) (
kube_node_role{role="worker"}
)
) / 1024^3

🧾 Nota operativa WR

In caso di incidente, partire sempre da: CPU → Memoria → Restart → Rete → Storage → Job
e correlare spike + eventi per identificare la root cause.