Files
quicproquo/docs/operations/prometheus-alerts.yml
Christian Nennemann 2e081ead8e chore: rename quicproquo → quicprochat in docs, Docker, CI, and packaging
Rename all project references from quicproquo/qpq to quicprochat/qpc
across documentation, Docker configuration, CI workflows, packaging
scripts, operational configs, and build tooling.

- Docker: crate paths, binary names, user/group, data dirs, env vars
- CI: workflow crate references, binary names, artifact names
- Docs: all markdown files under docs/, SDK READMEs, book.toml
- Packaging: OpenWrt Makefile, init script, UCI config (file renames)
- Scripts: justfile, dev-shell, screenshot, cross-compile, ai_team
- Operations: Prometheus config, alert rules, Grafana dashboard
- Config: .env.example (QPQ_* → QPC_*), CODEOWNERS paths
- Top-level: README, CONTRIBUTING, ROADMAP, CLAUDE.md
2026-03-21 19:14:06 +01:00

56 lines
1.6 KiB
YAML

---
# Prometheus alerting rules for the qpc-server scrape job.
#
# Layout: one rule group containing six alerts. Each rule pairs a PromQL
# condition (`expr`) with a hold time (`for`) during which the condition must
# stay true before the alert fires. `severity` is the only label attached here.
groups:
  - name: qpc-server
    rules:
      # --- Availability ----------------------------------------------------
      # Fires when the `up` series for the job reports 0 for a full minute.
      # NOTE(review): this only matches an existing `up` series; if the job is
      # removed from the scrape config there is no series to compare, so the
      # alert stays silent. Consider pairing with `absent(...)` - confirm.
      - alert: QpcServerDown
        expr: up{job="qpc-server"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "qpc-server is down"
          description: "Prometheus cannot scrape qpc-server metrics for > 1 minute."

      # --- Authentication --------------------------------------------------
      # Per-second login failure rate (5m window) above 10, held for 2 minutes.
      - alert: QpcHighAuthFailureRate
        expr: rate(auth_login_failure_total[5m]) > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High authentication failure rate"
          description: '{{ $value | printf "%.1f" }} auth failures/sec over 5 minutes.'

      # --- Rate limiting ---------------------------------------------------
      # Per-second rate-limit hit rate (5m window) above 5, held for 5 minutes.
      - alert: QpcRateLimitActive
        expr: rate(rate_limit_hit_total[5m]) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Rate limiting is actively rejecting requests"
          description: '{{ $value | printf "%.1f" }} rate-limit hits/sec over 5 minutes.'

      # --- Delivery queue --------------------------------------------------
      # Two tiers on the same gauge. The warning condition (> 10000) is also
      # true whenever the critical one (> 100000) is, so both alerts can be
      # active at once.
      # NOTE(review): an Alertmanager inhibit rule may be wanted - confirm.
      - alert: QpcDeliveryQueueHigh
        expr: delivery_queue_depth > 10000
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Delivery queue depth is high ({{ $value }})"
          description: "delivery_queue_depth has stayed above 10000 for 10 minutes."

      - alert: QpcDeliveryQueueCritical
        expr: delivery_queue_depth > 100000
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Delivery queue depth is critical ({{ $value }})"
          description: "delivery_queue_depth has stayed above 100000 for 5 minutes."

      # --- Authentication ratio --------------------------------------------
      # success / (success + failure), each as a 5m rate, below 0.5 for
      # 10 minutes. Folded scalar (`>-`) joins the three lines into a single
      # PromQL expression without a trailing newline.
      # NOTE(review): with zero login traffic the division should yield NaN,
      # which the `<` comparison filters out, so the alert should not fire
      # when idle - confirm against the PromQL operator documentation.
      - alert: QpcLowAuthSuccessRatio
        expr: >-
          rate(auth_login_success_total[5m])
          / (rate(auth_login_success_total[5m]) + rate(auth_login_failure_total[5m]))
          < 0.5
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Auth success ratio below 50%"
          description: 'Login success ratio is {{ $value | printf "%.2f" }} (5m rate); below 0.5 for 10 minutes.'