docs: add operational runbook, Grafana dashboard, and production docker-compose

Add comprehensive operational documentation:
- docs/operations/backup-restore.md: SQLCipher, file backend, blob backup/restore
- docs/operations/key-rotation.md: auth token, TLS, federation, DB key, OPAQUE rotation
- docs/operations/incident-response.md: playbook for common incidents
- docs/operations/scaling-guide.md: resource sizing, scaling triggers, capacity planning
- docs/operations/monitoring.md: Prometheus metrics, alert rules, log monitoring
- docs/operations/dashboards/qpq-overview.json: Grafana dashboard template
- docs/operations/prometheus.yml + alerts: Prometheus scrape and alert config
- docs/operations/grafana-provisioning/: auto-provisioning for datasources and dashboards
- docker-compose.prod.yml: production stack (server + Prometheus + Grafana)
- .env.example: documented environment variable template
This commit is contained in:
2026-03-04 20:30:57 +01:00
parent b94248b3b6
commit 91c5495ab7
12 changed files with 1872 additions and 0 deletions

View File

@@ -0,0 +1,395 @@
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "Prometheus",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "10.0.0"
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
},
{
"type": "panel",
"id": "timeseries",
"name": "Time series",
"version": ""
},
{
"type": "panel",
"id": "stat",
"name": "Stat",
"version": ""
},
{
"type": "panel",
"id": "gauge",
"name": "Gauge",
"version": ""
}
],
"id": null,
"uid": "qpq-overview",
"title": "quicproquo Server Overview",
"description": "Operational dashboard for quicproquo server instances",
"tags": ["quicproquo", "qpq"],
"timezone": "browser",
"editable": true,
"graphTooltip": 1,
"refresh": "10s",
"time": {
"from": "now-1h",
"to": "now"
},
"panels": [
{
"title": "Server Status",
"description": "Prometheus scrape health (the `up` series) for the qpq-server job: 1 is shown as UP, 0 as DOWN.",
"type": "stat",
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 0 },
"targets": [
{
"expr": "up{job=\"qpq-server\"}",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"mappings": [
{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" }, "1": { "text": "UP", "color": "green" } } }
],
"thresholds": {
"steps": [
{ "value": null, "color": "red" },
{ "value": 1, "color": "green" }
]
}
}
}
},
{
"title": "Enqueue Rate",
"description": "Per-second rate of enqueue_total over a 5-minute window. Turns yellow at 100/s and red at 500/s.",
"type": "stat",
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 0 },
"targets": [
{
"expr": "rate(enqueue_total[5m])",
"legendFormat": "msgs/sec"
}
],
"fieldConfig": {
"defaults": {
"unit": "ops",
"thresholds": {
"steps": [
{ "value": null, "color": "green" },
{ "value": 100, "color": "yellow" },
{ "value": 500, "color": "red" }
]
}
}
}
},
{
"title": "Fetch Rate",
"description": "Per-second rate of fetch_total over a 5-minute window.",
"type": "stat",
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 0 },
"targets": [
{
"expr": "rate(fetch_total[5m])",
"legendFormat": "fetches/sec"
}
],
"fieldConfig": {
"defaults": {
"unit": "ops",
"thresholds": {
"steps": [
{ "value": null, "color": "green" }
]
}
}
}
},
{
"title": "Auth Success Rate",
"description": "Share of login attempts that succeeded over the last 5 minutes, summed across all scraped series: success / (success + failure). A missing failure series is treated as zero failures.",
"type": "gauge",
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 0 },
"targets": [
{
"expr": "sum(rate(auth_login_success_total[5m])) / (sum(rate(auth_login_success_total[5m])) + (sum(rate(auth_login_failure_total[5m])) or vector(0)))",
"legendFormat": "success ratio"
}
],
"fieldConfig": {
"defaults": {
"unit": "percentunit",
"min": 0,
"max": 1,
"thresholds": {
"steps": [
{ "value": null, "color": "red" },
{ "value": 0.5, "color": "yellow" },
{ "value": 0.9, "color": "green" }
]
}
}
}
},
{
"title": "Delivery Queue Depth",
"description": "Current value of the delivery_queue_depth metric. Turns yellow at 10,000 and red at 100,000.",
"type": "stat",
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 0 },
"targets": [
{
"expr": "delivery_queue_depth",
"legendFormat": "depth"
}
],
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{ "value": null, "color": "green" },
{ "value": 10000, "color": "yellow" },
{ "value": 100000, "color": "red" }
]
}
}
}
},
{
"title": "Rate Limit Hits",
"description": "Per-second rate of rate_limit_hit_total over a 5-minute window. Turns yellow at 1/s and red at 10/s.",
"type": "stat",
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 0 },
"targets": [
{
"expr": "rate(rate_limit_hit_total[5m])",
"legendFormat": "hits/sec"
}
],
"fieldConfig": {
"defaults": {
"unit": "ops",
"thresholds": {
"steps": [
{ "value": null, "color": "green" },
{ "value": 1, "color": "yellow" },
{ "value": 10, "color": "red" }
]
}
}
}
},
{
"title": "Message Throughput",
"description": "Per-second rates of enqueue_total, fetch_total and fetch_wait_total, each over a 5-minute window.",
"type": "timeseries",
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
"targets": [
{
"expr": "rate(enqueue_total[5m])",
"legendFormat": "enqueue rate"
},
{
"expr": "rate(fetch_total[5m])",
"legendFormat": "fetch rate"
},
{
"expr": "rate(fetch_wait_total[5m])",
"legendFormat": "fetch_wait rate"
}
],
"fieldConfig": {
"defaults": {
"unit": "ops",
"custom": {
"drawStyle": "line",
"lineInterpolation": "smooth",
"fillOpacity": 10
}
}
}
},
{
"title": "Enqueue Bandwidth",
"description": "Per-second rate of enqueue_bytes_total over a 5-minute window, displayed in bytes per second.",
"type": "timeseries",
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
"targets": [
{
"expr": "rate(enqueue_bytes_total[5m])",
"legendFormat": "bytes/sec"
}
],
"fieldConfig": {
"defaults": {
"unit": "Bps",
"custom": {
"drawStyle": "line",
"lineInterpolation": "smooth",
"fillOpacity": 10,
"gradientMode": "scheme"
}
}
}
},
{
"title": "Authentication",
"description": "Per-second rates of auth_login_success_total and auth_login_failure_total over a 5-minute window. The failure series is drawn in red.",
"type": "timeseries",
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
"targets": [
{
"expr": "rate(auth_login_success_total[5m])",
"legendFormat": "success/sec"
},
{
"expr": "rate(auth_login_failure_total[5m])",
"legendFormat": "failure/sec"
}
],
"fieldConfig": {
"defaults": {
"unit": "ops",
"custom": {
"drawStyle": "line",
"lineInterpolation": "smooth",
"fillOpacity": 10
}
},
"overrides": [
{
"matcher": { "id": "byName", "options": "failure/sec" },
"properties": [
{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }
]
}
]
}
},
{
"title": "Delivery Queue Depth Over Time",
"description": "History of the delivery_queue_depth metric, with threshold areas at 10,000 (yellow) and 100,000 (red).",
"type": "timeseries",
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
"targets": [
{
"expr": "delivery_queue_depth",
"legendFormat": "queue depth"
}
],
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "smooth",
"fillOpacity": 20,
"gradientMode": "scheme",
"thresholdsStyle": { "mode": "area" }
},
"thresholds": {
"steps": [
{ "value": null, "color": "green" },
{ "value": 10000, "color": "yellow" },
{ "value": 100000, "color": "red" }
]
}
}
}
},
{
"title": "Rate Limiting & Key Packages",
"description": "Per-second rates of rate_limit_hit_total and key_package_upload_total over a 5-minute window. Rate limit hits are drawn in orange.",
"type": "timeseries",
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 },
"targets": [
{
"expr": "rate(rate_limit_hit_total[5m])",
"legendFormat": "rate limit hits/sec"
},
{
"expr": "rate(key_package_upload_total[5m])",
"legendFormat": "key package uploads/sec"
}
],
"fieldConfig": {
"defaults": {
"unit": "ops",
"custom": {
"drawStyle": "line",
"lineInterpolation": "smooth",
"fillOpacity": 10
}
},
"overrides": [
{
"matcher": { "id": "byName", "options": "rate limit hits/sec" },
"properties": [
{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }
]
}
]
}
},
{
"title": "Cumulative Totals",
"description": "Raw values (not rates) of the enqueue_total, fetch_total and auth_login_success_total series.",
"type": "timeseries",
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 },
"targets": [
{
"expr": "enqueue_total",
"legendFormat": "total enqueued"
},
{
"expr": "fetch_total",
"legendFormat": "total fetched"
},
{
"expr": "auth_login_success_total",
"legendFormat": "total logins"
}
],
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "smooth",
"fillOpacity": 5
}
}
}
}
],
"templating": {
"list": [
{
"name": "datasource",
"type": "datasource",
"query": "prometheus",
"current": {},
"hide": 0
}
]
},
"annotations": {
"list": [
{
"name": "Deploys",
"datasource": { "type": "grafana", "uid": "-- Grafana --" },
"enable": true,
"iconColor": "blue",
"target": {
"type": "tags",
"tags": ["deploy"],
"matchAny": false,
"limit": 100
}
}
]
},
"schemaVersion": 38,
"version": 1
}