---
# Prometheus alerting rules for the qpq-server job.
# Each rule fires only after its expression has held continuously for the
# rule's `for` duration.
groups:
  - name: qpq-server
    rules:
      # Scrape health: `up` is 0 for the qpq-server job.
      - alert: QpqServerDown
        expr: up{job="qpq-server"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "qpq-server is down"
          description: "Prometheus cannot scrape qpq-server metrics for > 1 minute."

      # Per-second rate of failed logins, averaged over 5 minutes, above 10.
      # The expression is not aggregated, so the threshold applies to each
      # time series individually.
      - alert: QpqHighAuthFailureRate
        expr: rate(auth_login_failure_total[5m]) > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High authentication failure rate"
          description: "{{ $value | printf \"%.1f\" }} auth failures/sec over 5 minutes."

      # Per-second rate of rate-limit hits, averaged over 5 minutes, above 5.
      - alert: QpqRateLimitActive
        expr: rate(rate_limit_hit_total[5m]) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Rate limiting is actively rejecting requests"

      # Delivery queue depth: warning tier (above 10000 for 10 minutes).
      - alert: QpqDeliveryQueueHigh
        expr: delivery_queue_depth > 10000
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Delivery queue depth is high ({{ $value }})"

      # Delivery queue depth: critical tier (above 100000 for 5 minutes).
      # The higher threshold is paired with a shorter `for` than the warning.
      - alert: QpqDeliveryQueueCritical
        expr: delivery_queue_depth > 100000
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Delivery queue depth is critical ({{ $value }})"

      # Share of successful logins among all login attempts
      # (success / (success + failure)), below 0.5.
      # `>-` folds the lines below into a single-line expression with no
      # trailing newline.
      # NOTE(review): with zero login traffic the ratio is 0/0; confirm this
      # evaluates to NaN and therefore does not fire.
      - alert: QpqLowAuthSuccessRatio
        expr: >-
          rate(auth_login_success_total[5m])
          / (rate(auth_login_success_total[5m])
          + rate(auth_login_failure_total[5m]))
          < 0.5
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Auth success ratio below 50%"