Alerts

Inactive (16) Pending (0) Firing (1)

/etc/config/alerting_rules.yml > BlackBox Alerts

Probe failure (0 active)

# Blackbox probe rule: raised once probe_success has been 0 for a full minute
# on the compliance-service-check or dpm-service-check job (series labelled
# namespace="kube-system" are excluded).
alert: Probe failure
expr: 'probe_success{job=~"compliance-service-check|dpm-service-check",namespace!="kube-system"} == 0'
for: 1m
annotations:
  # Folded scalar: the two lines below join with a single space into one line.
  summary: >-
    The service {{ $labels.job }} is unreachable or down.
    please check the cluster for further information.

Public endpoint check (0 active)

# Fires when a probe for any job whose name starts with "external" has
# reported failure (probe_success == 0) for one minute.
alert: Public endpoint check
expr: probe_success{job=~"external.*"} == 0
for: 1m
labels:
  severity: warning
annotations:
  # Operator-facing text: spelling of "unreachable" corrected.
  summary: The service {{ $labels.job }} is unreachable from the internet, please check if URL is pointing to public endpoint.

/etc/config/alerting_rules.yml > MSSQL Alerts

KubernetesPodNotHealthy (7 active)

# Fires for any pod outside kube-system whose kube_pod_status_phase series for
# phase Pending, Unknown or Failed summed to more than 0 at every 1m step of
# the trailing 15m window (min_over_time over the [15m:1m] subquery).
# There is no `for:` clause; the 15-minute hold is expressed by the subquery.
# NOTE(review): Failed is matched as well, so pods left behind in that phase
# keep the alert firing until they are deleted; the long-lived instances
# listed under this rule look like such cases. Confirm whether Failed should
# be included.
alert: KubernetesPodNotHealthy
expr: min_over_time(sum by(namespace, pod) (kube_pod_status_phase{namespace!="kube-system",phase=~"Pending|Unknown|Failed"})[15m:1m]) > 0
labels:
  severity: critical
annotations:
  # NOTE(review): the text says "non-ready", but the expression tests pod
  # phase rather than readiness; confirm the intended wording.
  description: |-
    Pod has been in a non-ready state for longer than 15 minutes.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  # `sum by(namespace, pod)` keeps the `pod` label, so this template resolves.
  summary: Kubernetes Pod not healthy (instance {{ $labels.pod }})

Labels	State	Active Since	Value
alertname="KubernetesPodNotHealthy" namespace="p433" pod="ingress-nginx-controller-6fcfb6b495-njd4k" severity="critical"	firing	2025-11-25 15:36:46.936640537 +0000 UTC	1
Annotations
description Pod has been in a non-ready state for longer than 15 minutes. VALUE = 1 LABELS = map[namespace:p433 pod:ingress-nginx-controller-6fcfb6b495-njd4k] summary Kubernetes Pod not healthy (instance ingress-nginx-controller-6fcfb6b495-njd4k)
alertname="KubernetesPodNotHealthy" namespace="p433" pod="ingress-nginx-controller-6fcfb6b495-kfdjt" severity="critical"	firing	2025-11-25 15:22:46.936640537 +0000 UTC	1
Annotations
description Pod has been in a non-ready state for longer than 15 minutes. VALUE = 1 LABELS = map[namespace:p433 pod:ingress-nginx-controller-6fcfb6b495-kfdjt] summary Kubernetes Pod not healthy (instance ingress-nginx-controller-6fcfb6b495-kfdjt)
alertname="KubernetesPodNotHealthy" namespace="p433" pod="ingress-nginx-controller-6fcfb6b495-hzd96" severity="critical"	firing	2025-12-30 00:16:46.936640537 +0000 UTC	1
Annotations
description Pod has been in a non-ready state for longer than 15 minutes. VALUE = 1 LABELS = map[namespace:p433 pod:ingress-nginx-controller-6fcfb6b495-hzd96] summary Kubernetes Pod not healthy (instance ingress-nginx-controller-6fcfb6b495-hzd96)
alertname="KubernetesPodNotHealthy" namespace="p433" pod="ingress-nginx-controller-6fcfb6b495-gj7lz" severity="critical"	firing	2025-11-10 18:36:46.936640537 +0000 UTC	1
Annotations
description Pod has been in a non-ready state for longer than 15 minutes. VALUE = 1 LABELS = map[namespace:p433 pod:ingress-nginx-controller-6fcfb6b495-gj7lz] summary Kubernetes Pod not healthy (instance ingress-nginx-controller-6fcfb6b495-gj7lz)
alertname="KubernetesPodNotHealthy" namespace="p433" pod="job-licensestatuscheck-0.0.9.228.0-hz2df" severity="critical"	firing	2026-01-27 00:16:46.936640537 +0000 UTC	1
Annotations
description Pod has been in a non-ready state for longer than 15 minutes. VALUE = 1 LABELS = map[namespace:p433 pod:job-licensestatuscheck-0.0.9.228.0-hz2df] summary Kubernetes Pod not healthy (instance job-licensestatuscheck-0.0.9.228.0-hz2df)
alertname="KubernetesPodNotHealthy" namespace="p433" pod="ingress-nginx-controller-6fcfb6b495-wtm6m" severity="critical"	firing	2025-12-20 10:13:46.936640537 +0000 UTC	1
Annotations
description Pod has been in a non-ready state for longer than 15 minutes. VALUE = 1 LABELS = map[namespace:p433 pod:ingress-nginx-controller-6fcfb6b495-wtm6m] summary Kubernetes Pod not healthy (instance ingress-nginx-controller-6fcfb6b495-wtm6m)
alertname="KubernetesPodNotHealthy" namespace="p433" pod="ingress-nginx-controller-6fcfb6b495-n76ql" severity="critical"	firing	2025-12-10 20:40:46.936640537 +0000 UTC	1
Annotations
description Pod has been in a non-ready state for longer than 15 minutes. VALUE = 1 LABELS = map[namespace:p433 pod:ingress-nginx-controller-6fcfb6b495-n76ql] summary Kubernetes Pod not healthy (instance ingress-nginx-controller-6fcfb6b495-n76ql)

DatabaseMaintainenceJobCountIncreased (0 active)

# Fires when the `jobcount` series scraped by the prometheus-query-exporter job
# has been above zero for one minute.
# NOTE(review): the alert name keeps its original spelling ("Maintainence")
# because routes and silences may match on it; only the human-readable
# annotation text is corrected.
alert: DatabaseMaintainenceJobCountIncreased
expr: jobcount{job="prometheus-query-exporter"} > 0
for: 1m
annotations:
  # The description now carries the value and label set, like the other rules
  # in this group, instead of repeating the summary verbatim.
  description: |-
    Database maintenance job count is above zero
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Database maintenance job count increased

HostHighCpuLoad (0 active)

# Fires when the non-idle CPU share of an instance, derived from the 2m rate of
# idle node_cpu_seconds_total averaged per instance, stays above 80% for two
# minutes.
alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    CPU load is > 80%
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  # `avg by(instance)` leaves `instance` as the only label on the result, so
  # the summary must reference it; `pod` would always render empty.
  summary: Host high CPU load (instance {{ $labels.instance }})

KubernetesDiskPressure (0 active)

# Fires when a node has reported the DiskPressure condition as true for two
# minutes.
# NOTE(review): nodes are not namespaced, so the namespace matcher filters on
# whatever `namespace` label the scrape attaches to this series; confirm it
# does not exclude the series entirely.
alert: KubernetesDiskPressure
expr: kube_node_status_condition{condition="DiskPressure",namespace!="kube-system",status="true"} == 1
for: 2m
labels:
  severity: critical
annotations:
  description: |-
    {{ $labels.node }} has DiskPressure condition
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  # The affected object is a node; use the same `node` label as the
  # description instead of `pod`.
  summary: Kubernetes disk pressure (instance {{ $labels.node }})

KubernetesMemoryPressure (0 active)

# Fires when a node has reported the MemoryPressure condition as true for two
# minutes.
# NOTE(review): nodes are not namespaced, so the namespace matcher filters on
# whatever `namespace` label the scrape attaches to this series; confirm it
# does not exclude the series entirely.
alert: KubernetesMemoryPressure
expr: kube_node_status_condition{condition="MemoryPressure",namespace!="kube-system",status="true"} == 1
for: 2m
labels:
  severity: critical
annotations:
  description: |-
    {{ $labels.node }} has MemoryPressure condition
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  # The affected object is a node; use the same `node` label as the
  # description instead of `pod`.
  summary: Kubernetes memory pressure (instance {{ $labels.node }})

KubernetesNodeReady (0 active)

# Fires when a node's Ready condition has not been true (the status="true"
# series equals 0) for ten minutes.
# NOTE(review): nodes are not namespaced, so the namespace matcher filters on
# whatever `namespace` label the scrape attaches to this series; confirm it
# does not exclude the series entirely.
alert: KubernetesNodeReady
expr: kube_node_status_condition{condition="Ready",namespace!="kube-system",status="true"} == 0
for: 10m
labels:
  severity: critical
annotations:
  description: |-
    Node {{ $labels.node }} has been unready for a long time
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  # The affected object is a node; use the same `node` label as the
  # description instead of `pod`.
  summary: Kubernetes Node ready (instance {{ $labels.node }})

KubernetesOutOfCapacity (0 active)

# Fires when, for two minutes, the number of Running pods on a node exceeds 90%
# of that node's allocatable pod count.
# The `+ on(pod, namespace) group_left(node) (0 * kube_pod_info)` term adds
# zero to each Running-pod sample and exists only to copy the `node` label onto
# it so the pods can be summed per node.
# NOTE(review): kube_node_status_allocatable_pods may not be exported by newer
# kube-state-metrics releases; confirm the metric still exists in this cluster.
alert: KubernetesOutOfCapacity
expr: sum by(node) ((kube_pod_status_phase{namespace!="kube-system",phase="Running"} == 1) + on(pod, namespace) group_left(node) (0 * kube_pod_info)) / sum by(node) (kube_node_status_allocatable_pods{namespace!="kube-system"}) * 100 > 90
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    {{ $labels.node }} is out of capacity
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  # Both sides are aggregated with `sum by(node)`, so `node` is the only label
  # left on the result; `pod` would always render empty.
  summary: Kubernetes out of capacity (instance {{ $labels.node }})

KubernetesOutOfDisk (0 active)

# Fires when a node has reported the OutOfDisk condition as true for two
# minutes.
# NOTE(review): current Kubernetes releases may no longer report an OutOfDisk
# node condition, in which case this rule can never fire; confirm against the
# cluster version. The namespace matcher on this node-level metric should be
# verified as well.
alert: KubernetesOutOfDisk
expr: kube_node_status_condition{condition="OutOfDisk",namespace!="kube-system",status="true"} == 1
for: 2m
labels:
  severity: critical
annotations:
  description: |-
    {{ $labels.node }} has OutOfDisk condition
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  # The affected object is a node; use the same `node` label as the
  # description instead of `pod`.
  summary: Kubernetes out of disk (instance {{ $labels.node }})

KubernetesPersistentvolumeError (0 active)

# Fires as soon as a PersistentVolume reported by the kube-state-metrics job is
# in phase Failed or Pending (there is no `for:` hold).
# NOTE(review): PersistentVolumes are cluster-scoped, so the namespace matcher
# filters on whatever `namespace` label the scrape attaches; confirm it does
# not exclude the series entirely.
alert: KubernetesPersistentvolumeError
expr: kube_persistentvolume_status_phase{job="kube-state-metrics",namespace!="kube-system",phase=~"Failed|Pending"} > 0
labels:
  severity: critical
annotations:
  description: |-
    Persistent volume is in bad state
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  # The metric describes a volume, not a pod; name the volume so the summary
  # identifies the affected object.
  summary: Kubernetes PersistentVolume error (instance {{ $labels.persistentvolume }})

KubernetesPersistentvolumeclaimPending (0 active)

# Fires when a PersistentVolumeClaim outside kube-system has been in phase
# Pending for two minutes.
alert: KubernetesPersistentvolumeclaimPending
expr: kube_persistentvolumeclaim_status_phase{namespace!="kube-system",phase="Pending"} == 1
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  # Identify the claim with the same labels the description uses; the series
  # describes a claim, not a pod.
  summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})

KubernetesPodCrashLooping (0 active)

# Warns when a container restart counter outside kube-system grows by more
# than 3 over a 1-minute range and the condition holds for two minutes.
# NOTE(review): a 1m range needs at least two samples inside it, and more than
# three restarts per minute is a high bar; confirm this threshold can actually
# be reached with the configured scrape interval.
alert: KubernetesPodCrashLooping
expr: 'increase(kube_pod_container_status_restarts_total{namespace!="kube-system"}[1m]) > 3'
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Pod {{ $labels.pod }} is crash looping
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: 'Kubernetes pod crash looping (instance {{ $labels.pod }})'

KubernetesVolumeOutOfDiskSpace (0 active)

# Fires when a volume has had less than 10% of its capacity available for two
# minutes. The `sum without(...)` drops the listed node-describing labels from
# the result.
alert: KubernetesVolumeOutOfDiskSpace
expr: sum without(beta_kubernetes_io_arch, beta_kubernetes_io_instance_type, failure_domain_beta_kubernetes_io_region, kubernetes_azure_com_cluster, kubernetes_azure_com_node_image_version, kubernetes_azure_com_role, kubernetes_io_arch, kubernetes_io_hostname, kubernetes_io_os, kubernetes_io_role, node_kubernetes_io_instance_type, topology_kubernetes_io_region, topology_kubernetes_io_zone, failure_domain_beta_kubernetes_io_zone) (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Volume is almost full (< 10% left)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  # kubelet volume stats identify a volume by namespace and claim name rather
  # than by pod, so name the claim here.
  # NOTE(review): confirm both labels are present on the series in this setup.
  summary: Kubernetes Volume out of disk space (instance {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})

MSSQL connectivity alert (0 active)

# Fires when the prometheus-mssql-exporter scrape target has been down
# (up == 0) for one minute.
alert: MSSQL connectivity alert
expr: up{job="prometheus-mssql-exporter"} == 0
for: 1m
labels:
  # Label values are matched case-sensitively; every other rule uses the
  # lowercase form, so "Critical" would miss a route or inhibit rule keyed on
  # severity="critical".
  severity: critical
annotations:
  summary: The service {{ $labels.job }} is unreachable or down. please check the MSSQL for further information.

compliance alert (0 active)

# "Enabled" half of the compliance pair: active whenever the probe for the
# `compliance` job reports success (probe_success == 1). No `for:` hold.
# NOTE(review): another rule in this group shares the same alert name and
# covers the == 0 case; confirm the shared name is intentional.
alert: compliance alert
expr: 'probe_success{job="compliance",namespace!="kube-system"} == 1'
labels:
  severity: 'warning'
annotations:
  summary: 'The service {{ $labels.job }} compliance is enabled.'

compliance alert (0 active)

# "Disabled" half of the compliance pair: active whenever the probe for the
# `compliance` job reports failure (probe_success == 0). No `for:` hold.
alert: compliance alert
expr: 'probe_success{job="compliance",namespace!="kube-system"} == 0'
labels:
  # Quoted so the value is unmistakably the literal string "None".
  Notification: 'None'
  severity: 'warning'
annotations:
  summary: 'The service {{ $labels.job }} compliance is disabled.'