docs: add operational runbook, Grafana dashboard, and production docker-compose
Add comprehensive operational documentation:
- docs/operations/backup-restore.md: SQLCipher, file backend, blob backup/restore
- docs/operations/key-rotation.md: auth token, TLS, federation, DB key, OPAQUE rotation
- docs/operations/incident-response.md: playbook for common incidents
- docs/operations/scaling-guide.md: resource sizing, scaling triggers, capacity planning
- docs/operations/monitoring.md: Prometheus metrics, alert rules, log monitoring
- docs/operations/dashboards/qpq-overview.json: Grafana dashboard template
- docs/operations/prometheus.yml, docs/operations/prometheus-alerts.yml: Prometheus scrape and alert config
- docs/operations/grafana-provisioning/: auto-provisioning for datasources and dashboards
- docker-compose.prod.yml: production stack (server + Prometheus + Grafana)
- .env.example: documented environment variable template
This commit is contained in:
55
docs/operations/prometheus-alerts.yml
Normal file
55
docs/operations/prometheus-alerts.yml
Normal file
@@ -0,0 +1,55 @@
|
||||
# Prometheus alerting rules: a single rule group named qpq-server.
groups:
  - name: qpq-server
    rules:
|
||||
# Critical: the qpq-server scrape target has been down or missing for 1 minute.
- alert: QpqServerDown
  # `up == 0` only matches a target that exists but fails its scrape. If the
  # job has no `up` series at all, that comparison returns nothing and the
  # alert would stay silent, so absent() covers the missing-series case.
  expr: up{job="qpq-server"} == 0 or absent(up{job="qpq-server"})
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: "qpq-server is down"
    description: "Prometheus cannot scrape qpq-server metrics for > 1 minute."
|
||||
|
||||
# Warning: the 5m rate of auth_login_failure_total has stayed above 10/s
# for 2 minutes.
- alert: QpqHighAuthFailureRate
  expr: rate(auth_login_failure_total[5m]) > 10
  for: 2m
  annotations:
    summary: High authentication failure rate
    # Single quotes keep the template's inner double quotes literal.
    description: '{{ $value | printf "%.1f" }} auth failures/sec over 5 minutes.'
  labels:
    severity: warning
|
||||
|
||||
# Warning: the 5m rate of rate_limit_hit_total has stayed above 5/s
# for 5 minutes.
- alert: QpqRateLimitActive
  expr: rate(rate_limit_hit_total[5m]) > 5
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Rate limiting is actively rejecting requests"
    # Report the measured rate, formatted as in QpqHighAuthFailureRate.
    description: '{{ $value | printf "%.1f" }} rate-limit hits/sec over 5 minutes.'
|
||||
|
||||
# Warning tier: delivery_queue_depth has stayed above 10000 for 10 minutes.
- alert: QpqDeliveryQueueHigh
  expr: delivery_queue_depth > 10000
  for: 10m
  annotations:
    # $value is the queue depth that triggered the alert.
    summary: 'Delivery queue depth is high ({{ $value }})'
  labels:
    severity: warning
|
||||
|
||||
# Critical tier: delivery_queue_depth has stayed above 100000 for 5 minutes
# (a shorter hold time than the 10m used by QpqDeliveryQueueHigh).
- alert: QpqDeliveryQueueCritical
  expr: delivery_queue_depth > 100000
  for: 5m
  annotations:
    # $value is the queue depth that triggered the alert.
    summary: 'Delivery queue depth is critical ({{ $value }})'
  labels:
    severity: critical
|
||||
|
||||
# Warning: success / (success + failure), computed from the 5m rates of the
# auth_login_*_total counters, has stayed below 0.5 for 10 minutes.
- alert: QpqLowAuthSuccessRatio
  # `>-` folds the three lines into one expression and strips the trailing
  # newline that a plain `>` would leave at the end of the query.
  # NOTE(review): with very little login traffic a handful of failures can
  # push the ratio under 0.5; consider a minimum-volume guard. TODO confirm.
  expr: >-
    rate(auth_login_success_total[5m])
    / (rate(auth_login_success_total[5m]) + rate(auth_login_failure_total[5m]))
    < 0.5
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: "Auth success ratio below 50%"
    # $value is the ratio itself (0-1); humanizePercentage renders it as a percent.
    description: 'Auth success ratio is {{ $value | humanizePercentage }} over 5 minutes.'
|
||||
Reference in New Issue
Block a user