---
# Prometheus alerting rules for the qpc-server job.
# Each rule fires once its `expr` has held continuously for the `for` duration.
groups:
  - name: qpc-server
    rules:
      # Scrape target health: `up` is 0 for the qpc-server job.
      - alert: QpcServerDown
        expr: up{job="qpc-server"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "qpc-server is down"
          description: "Prometheus cannot scrape qpc-server metrics for > 1 minute."

      # Login failures: 5-minute rate of auth_login_failure_total above 10.
      - alert: QpcHighAuthFailureRate
        expr: rate(auth_login_failure_total[5m]) > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High authentication failure rate"
          description: "{{ $value | printf \"%.1f\" }} auth failures/sec over 5 minutes."

      # Rate limiter: 5-minute rate of rate_limit_hit_total above 5.
      - alert: QpcRateLimitActive
        expr: rate(rate_limit_hit_total[5m]) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Rate limiting is actively rejecting requests"

      # Delivery backlog, warning tier: depth above 10000 for 10 minutes.
      # NOTE(review): this condition also holds whenever the critical rule
      # below matches (depth > 100000), so both alerts can be active at once;
      # presumably deduplicated by an Alertmanager inhibit rule -- confirm.
      - alert: QpcDeliveryQueueHigh
        expr: delivery_queue_depth > 10000
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Delivery queue depth is high ({{ $value }})"

      # Delivery backlog, critical tier: depth above 100000 for 5 minutes.
      - alert: QpcDeliveryQueueCritical
        expr: delivery_queue_depth > 100000
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Delivery queue depth is critical ({{ $value }})"

      # Share of successful logins among all login attempts (success + failure),
      # computed from 5-minute rates, below 0.5.
      # NOTE(review): with zero login traffic both rates are 0, the division
      # yields NaN, and the `< 0.5` filter drops it, so the rule stays silent
      # in that case -- confirm that is the intended behavior.
      - alert: QpcLowAuthSuccessRatio
        expr: >
          rate(auth_login_success_total[5m])
          / (rate(auth_login_success_total[5m]) + rate(auth_login_failure_total[5m]))
          < 0.5
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Auth success ratio below 50%"