# PrometheusRule resource "istio-ambient-alerts" in the "monitoring" namespace.
# All alert rule groups are listed under spec.groups.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: istio-ambient-alerts
  namespace: monitoring
  labels:
    # NOTE(review): presumably these labels are what the Prometheus
    # instance's ruleSelector matches on — confirm against the Prometheus CR.
    prometheus: kube-prometheus
    role: alert-rules
spec:
  groups:

  # Certificate-expiry alerts. Rules in this group are evaluated every 60s.
  - name: istio.cert
    interval: 60s
    rules:

    # Early warning: the root cert expiry gauge has been below 30 days for
    # 5 minutes. The annotation renders the gauge as a remaining duration.
    - alert: IstiodRootCertExpiryWarning
      for: 5m
      # 30 days expressed in seconds (30 * 24 * 3600 = 2592000).
      expr: citadel_server_root_cert_expiry_seconds < 30 * 24 * 3600
      labels:
        severity: warning
      annotations:
        summary: 'istiod root CA cert expiring in < 30 days'
        description: >-
          Root cert expires in {{ humanizeDuration $value }}.
          Root CA rotation is disruptive — plan ahead.

    # Critical escalation of the root cert expiry alert: the gauge has been
    # below 7 days for 5 minutes.
    - alert: IstiodRootCertExpiryCritical
      for: 5m
      # 7 days expressed in seconds (7 * 24 * 3600 = 604800).
      expr: citadel_server_root_cert_expiry_seconds < 7 * 24 * 3600
      labels:
        severity: critical
      annotations:
        summary: 'istiod root CA cert expiring in < 7 days'
        description: >-
          Root cert expires in {{ humanizeDuration $value }}.
          Immediate action required.

    # Warns when the cert chain expiry gauge has been below 1 day for
    # 5 minutes.
    # NOTE(review): the metric is the istiod *cert chain* expiry while the
    # alert is named "Workload" — confirm which certificate this tracks.
    - alert: IstiodWorkloadCertExpiryWarning
      for: 5m
      # 1 day expressed in seconds (24 * 3600 = 86400).
      expr: citadel_server_cert_chain_expiry_seconds < 24 * 3600
      labels:
        severity: warning
      annotations:
        summary: 'istiod workload cert chain expiring in < 1 day'
        description: 'Cert chain expires in {{ humanizeDuration $value }}.'

  # Alerts on the istiod_managed_clusters gauge (local and remote clusters).
  # Rules in this group are evaluated every 30s.
  - name: istio.multicluster
    interval: 30s
    rules:

    # Fires when the remote-cluster gauge has read 0 for 5 minutes.
    # A comparison only filters existing series: if the series is missing
    # entirely, this rule stays silent.
    - alert: IstiodRemoteClusterDisconnected
      for: 5m
      expr: >-
        istiod_managed_clusters{cluster_type="remote"} == 0
      labels:
        severity: critical
      annotations:
        summary: 'istiod has no remote cluster connections'
        description: >-
          Cross-cluster endpoint rewriting is broken.
          Check istio-remote-secret-* in istio-system and peer istiod logs.

    # Fires when istiod does not report exactly one local cluster, or when
    # the local-cluster gauge is missing altogether, for 2 minutes.
    - alert: IstiodDown
      # A comparison such as `!= 1` only filters series that exist. When
      # istiod is crash-looping or its metrics endpoint cannot be scraped,
      # the gauge goes stale and the comparison returns nothing, so the
      # alert would stay silent in exactly the cases the description names.
      # absent() yields a single series (value 1, cluster_type="local")
      # when no matching series exists at all, which covers that case.
      # With several istiod replicas, absent() only triggers once every
      # replica's series is gone.
      expr: |
        istiod_managed_clusters{cluster_type="local"} != 1
        or
        absent(istiod_managed_clusters{cluster_type="local"})
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: "istiod is not managing its local cluster"
        description: "istiod local cluster gauge is missing or != 1. Pod may be crash-looping or metrics endpoint is unreachable."

  # xDS push and proxy-convergence alerts.
  # Rules in this group are evaluated every 30s.
  - name: istio.xds.sync
    interval: 30s
    rules:

    # Fires per push type when that type's push rate has been 0 for 3 minutes
    # while the same istiod instance still has proxies connected.
    - alert: IstiodXdsPushStall
      # Why not a bare `A and B`:
      #   * `and` without on()/ignoring() requires identical label sets on
      #     both sides. pilot_xds_pushes carries a `type` label that
      #     pilot_xds does not have (per the Istio metric definitions,
      #     pilot_xds is labelled by `version`), so nothing ever matched.
      #   * `and` returns the left-hand sample, so $value would be the push
      #     rate (always 0), not the connected-proxy count the description
      #     prints.
      # Instead: take the stalled push series (value 0, keeps `type`) and add
      # the per-instance connected-proxy total via a many-to-one join. The
      # result keeps the `type` label and its value is the proxy count;
      # `> 0` then drops instances with no connected proxies.
      # Assumes both metrics come from the same scrape target and therefore
      # share `job` and `instance`.
      expr: |
        (
          (rate(pilot_xds_pushes[5m]) == 0)
          + on (job, instance) group_left
          sum by (job, instance) (pilot_xds)
        ) > 0
      for: 3m
      labels:
        severity: critical
      annotations:
        summary: "istiod xDS pushes have stalled"
        description: "{{ $labels.type }} push rate is 0 but {{ $value }} proxies are connected. Discovery filter may be blocked — check istiod logs for 'waiting for sync' and auth errors."

    # Warns when the 5-minute average of pilot_proxy_convergence_time
    # (sum rate divided by count rate) has exceeded 1 for 5 minutes. The
    # annotation renders the value as a duration.
    - alert: IstiodProxyConvergenceSlow
      for: 5m
      # Parentheses make the existing precedence explicit: the division is
      # evaluated before the comparison.
      expr: |
        (
          rate(pilot_proxy_convergence_time_sum[5m])
          /
          rate(pilot_proxy_convergence_time_count[5m])
        ) > 1
      labels:
        severity: warning
      annotations:
        summary: 'Proxy config convergence avg > 1s'
        description: >-
          Average time for a proxy to receive and ACK config is
          {{ $value | humanizeDuration }}. Push queue may be overloaded.

    # Warns when the 5-minute average push time for type="wds" has exceeded
    # 0.5 for 5 minutes. The annotation renders the value as a duration.
    - alert: IstiodWdsPushSlow
      for: 5m
      # Parentheses make the existing precedence explicit: the division is
      # evaluated before the comparison.
      expr: |
        (
          rate(pilot_xds_push_time_sum{type="wds"}[5m])
          /
          rate(pilot_xds_push_time_count{type="wds"}[5m])
        ) > 0.5
      labels:
        severity: warning
      annotations:
        summary: 'WDS push time avg > 500ms'
        description: >-
          Workload Discovery Service pushes to ztunnel are slow
          ({{ $value | humanizeDuration }}). May indicate large workload
          counts or push queue contention.

  # Endpoint, listener-conflict and push-trigger alerts.
  # Rules in this group are evaluated every 60s.
  - name: istio.config.errors
    interval: 60s
    rules:

    # Warns when endpoint_no_pod has been above 0 for 60 seconds.
    - alert: IstiodStaleEndpoints
      for: 60s
      expr: >-
        endpoint_no_pod > 0
      labels:
        severity: warning
      annotations:
        summary: '{{ $value }} endpoints with no backing pod'
        description: >-
          Stale endpoint entries — crashed pod endpoints not cleaned up.
          Traffic to these endpoints will fail.

    # Warns when pilot_eds_no_instances has been above 0 for the hold period.
    - alert: IstiodEmptyService
      # NOTE(review): the enclosing group is evaluated every 60s, so a 30s
      # hold is only satisfied at the next evaluation — effectively 60s.
      for: 30s
      expr: >-
        pilot_eds_no_instances > 0
      labels:
        severity: warning
      annotations:
        summary: '{{ $value }} services have zero endpoints'
        description: >-
          Services with no endpoints will return 503.
          May indicate a deployment failure or misconfigured selector.

    # Warns when either listener-conflict gauge has been above 0 for
    # 5 minutes.
    - alert: IstiodListenerConflict
      for: 5m
      # Each comparison is evaluated before `or`; when both sides produce a
      # series with the same labels, `or` keeps the left-hand (inbound) one.
      expr: >-
        pilot_conflict_inbound_listener > 0
        or
        pilot_conflict_outbound_listener_tcp_over_current_tcp > 0
      labels:
        severity: warning
      annotations:
        summary: 'istiod listener conflicts detected'
        description: >-
          {{ $value }} conflicting listeners. Traffic may be misrouted.
          Check for Service port collisions or duplicate ServiceEntries.

    # Warns when pushes triggered with type="global" have exceeded
    # 5 per minute (5-minute rate) for 5 minutes.
    - alert: IstiodConfigPushStorm
      # rate() is per second, so the 5/min threshold from the summary is
      # written as 5 / 60 (~0.0833/s). The previous literal 0.08 equalled
      # 4.8/min and fired slightly below the documented threshold.
      expr: rate(pilot_push_triggers{type="global"}[5m]) > 5 / 60
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "istiod global push rate > 5/min"
        description: "Frequent full mesh-wide pushes indicate a config storm. Check for a controller repeatedly updating CRDs."

  # Process-level alerts (goroutines, resident memory, file descriptors)
  # for series with job="istiod". Rules are evaluated every 60s.
  - name: istio.process
    interval: 60s
    rules:

    # Warns when the goroutine count has stayed above 3000 for 10 minutes.
    # NOTE(review): relies on the scrape job being named "istiod" — confirm.
    - alert: IstiodGoroutineLeak
      for: 10m
      expr: >-
        go_goroutines{job="istiod"} > 3000
      labels:
        severity: warning
      annotations:
        summary: 'istiod goroutine count > 3000'
        description: >-
          {{ $value }} goroutines. Sustained growth suggests a goroutine leak.
          Restart istiod if count keeps growing.

    # Warns when resident memory has stayed above the threshold for
    # 10 minutes.
    # NOTE(review): relies on the scrape job being named "istiod" — confirm.
    - alert: IstiodHighMemory
      for: 10m
      # 800 * 1024 * 1024 = 838860800 bytes.
      expr: process_resident_memory_bytes{job="istiod"} > 800 * 1024 * 1024
      labels:
        severity: warning
      annotations:
        summary: 'istiod RSS > 800MB'
        description: >-
          {{ $value | humanize1024 }}B resident memory.
          On resource-constrained nodes this risks OOMKill.

    # Critical when open file descriptors have exceeded 80% of the process
    # limit for 5 minutes.
    # NOTE(review): relies on the scrape job being named "istiod" — confirm.
    - alert: IstiodFdExhaustion
      for: 5m
      # Parentheses make the existing precedence explicit: the ratio is
      # computed before the comparison.
      expr: >-
        (process_open_fds{job="istiod"} / process_max_fds{job="istiod"}) > 0.8
      labels:
        severity: critical
      annotations:
        summary: 'istiod file descriptor usage > 80%'
        description: >-
          {{ $value | humanizePercentage }} of max FDs in use.
          Approaching exhaustion will cause new XDS stream failures.