From 91c5495ab752bb21b0d63ca9bd5b95f816db6cd3 Mon Sep 17 00:00:00 2001 From: Christian Nennemann Date: Wed, 4 Mar 2026 20:30:57 +0100 Subject: [PATCH] docs: add operational runbook, Grafana dashboard, and production docker-compose Add comprehensive operational documentation: - docs/operations/backup-restore.md: SQLCipher, file backend, blob backup/restore - docs/operations/key-rotation.md: auth token, TLS, federation, DB key, OPAQUE rotation - docs/operations/incident-response.md: playbook for common incidents - docs/operations/scaling-guide.md: resource sizing, scaling triggers, capacity planning - docs/operations/monitoring.md: Prometheus metrics, alert rules, log monitoring - docs/operations/dashboards/qpq-overview.json: Grafana dashboard template - docs/operations/prometheus.yml + alerts: Prometheus scrape and alert config - docs/operations/grafana-provisioning/: auto-provisioning for datasources and dashboards - docker-compose.prod.yml: production stack (server + Prometheus + Grafana) - .env.example: documented environment variable template --- .env.example | 20 + docker-compose.prod.yml | 113 +++++ docs/operations/backup-restore.md | 199 +++++++++ docs/operations/dashboards/qpq-overview.json | 395 ++++++++++++++++++ .../dashboards/default.yml | 12 + .../datasources/prometheus.yml | 9 + docs/operations/incident-response.md | 338 +++++++++++++++ docs/operations/key-rotation.md | 250 +++++++++++ docs/operations/monitoring.md | 225 ++++++++++ docs/operations/prometheus-alerts.yml | 55 +++ docs/operations/prometheus.yml | 12 + docs/operations/scaling-guide.md | 244 +++++++++++ 12 files changed, 1872 insertions(+) create mode 100644 .env.example create mode 100644 docker-compose.prod.yml create mode 100644 docs/operations/backup-restore.md create mode 100644 docs/operations/dashboards/qpq-overview.json create mode 100644 docs/operations/grafana-provisioning/dashboards/default.yml create mode 100644 docs/operations/grafana-provisioning/datasources/prometheus.yml create mode 100644 docs/operations/incident-response.md create mode 100644 docs/operations/key-rotation.md create mode 100644 docs/operations/monitoring.md create mode 100644 docs/operations/prometheus-alerts.yml create mode 100644 docs/operations/prometheus.yml create mode 100644 docs/operations/scaling-guide.md diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..66a1f20 --- /dev/null +++ b/.env.example @@ -0,0 +1,20 @@ +# quicproquo Production Environment Variables +# Copy this file to .env and fill in the values. + +# Server auth token (required, >= 16 characters) +QPQ_AUTH_TOKEN= + +# SQLCipher database encryption key (required for store_backend=sql) +QPQ_DB_KEY= + +# Ports (defaults shown) +QPQ_LISTEN_PORT=7000 +QPQ_WS_PORT=9000 + +# Optional features +QPQ_SEALED_SENDER=false +QPQ_REDACT_LOGS=true +QPQ_WS_LISTEN= + +# Grafana admin password +GRAFANA_ADMIN_PASSWORD=changeme diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml new file mode 100644 index 0000000..0b6f18f --- /dev/null +++ b/docker-compose.prod.yml @@ -0,0 +1,113 @@ +# Production Docker Compose for quicproquo +# +# Usage: +# 1. Copy .env.example to .env and fill in secrets +# 2. Place TLS certificates in ./certs/ +# 3. 
docker compose -f docker-compose.prod.yml up -d +# +# Prerequisites: +# - TLS certificate and key in DER format (no auto-generation in production) +# - Strong auth token (>= 16 characters) +# - Database encryption key + +networks: + qpq: + driver: bridge + +volumes: + qpq-data: + prometheus-data: + grafana-data: + +services: + # ── quicproquo server ──────────────────────────────────────────────────────── + server: + build: + context: . + dockerfile: docker/Dockerfile + restart: unless-stopped + ports: + - "${QPQ_LISTEN_PORT:-7000}:7000/udp" # QUIC + - "${QPQ_WS_PORT:-9000}:9000" # WebSocket bridge (optional) + environment: + RUST_LOG: info + QPQ_PRODUCTION: "true" + QPQ_LISTEN: "0.0.0.0:7000" + QPQ_DATA_DIR: /var/lib/quicproquo + QPQ_TLS_CERT: /var/lib/quicproquo/certs/server-cert.der + QPQ_TLS_KEY: /var/lib/quicproquo/certs/server-key.der + QPQ_AUTH_TOKEN: "${QPQ_AUTH_TOKEN}" + QPQ_STORE_BACKEND: sql + QPQ_DB_PATH: /var/lib/quicproquo/qpq.db + QPQ_DB_KEY: "${QPQ_DB_KEY}" + QPQ_METRICS_LISTEN: "0.0.0.0:9090" + QPQ_METRICS_ENABLED: "true" + QPQ_SEALED_SENDER: "${QPQ_SEALED_SENDER:-false}" + QPQ_REDACT_LOGS: "${QPQ_REDACT_LOGS:-true}" + QPQ_WS_LISTEN: "${QPQ_WS_LISTEN:-}" + volumes: + - qpq-data:/var/lib/quicproquo + - ./certs:/var/lib/quicproquo/certs:ro + networks: + - qpq + deploy: + resources: + limits: + cpus: '4' + memory: 4G + reservations: + cpus: '2' + memory: 1G + ulimits: + nofile: + soft: 65536 + hard: 65536 + healthcheck: + test: ["CMD", "test", "-f", "/var/lib/quicproquo/certs/server-cert.der"] + interval: 30s + timeout: 5s + retries: 3 + start_period: 10s + logging: + driver: json-file + options: + max-size: "50m" + max-file: "5" + + # ── Prometheus ─────────────────────────────────────────────────────────────── + prometheus: + image: prom/prometheus:latest + restart: unless-stopped + ports: + - "127.0.0.1:9091:9090" + volumes: + - prometheus-data:/prometheus + - ./docs/operations/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./docs/operations/prometheus-alerts.yml:/etc/prometheus/alerts.yml:ro + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--storage.tsdb.retention.time=30d' + - '--web.enable-lifecycle' + networks: + - qpq + depends_on: + - server + + # ── Grafana ────────────────────────────────────────────────────────────────── + grafana: + image: grafana/grafana:latest + restart: unless-stopped + ports: + - "127.0.0.1:3000:3000" + environment: + GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:-changeme}" + GF_USERS_ALLOW_SIGN_UP: "false" + volumes: + - grafana-data:/var/lib/grafana + - ./docs/operations/dashboards:/var/lib/grafana/dashboards:ro + - ./docs/operations/grafana-provisioning:/etc/grafana/provisioning:ro + networks: + - qpq + depends_on: + - prometheus diff --git a/docs/operations/backup-restore.md b/docs/operations/backup-restore.md new file mode 100644 index 0000000..a17e56e --- /dev/null +++ b/docs/operations/backup-restore.md @@ -0,0 +1,199 @@ +# Backup and Restore Procedures + +This document covers backup and restore for all quicproquo server data stores. 
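+
+Before running any of the procedures below, confirm which storage backend the deployment actually uses, since the SQLCipher and file-backend sections differ. A minimal check, assuming the environment-variable configuration used throughout this guide:
+
+```bash
+# Print the effective backend and paths (defaults per the Data Inventory table below)
+echo "backend:  ${QPQ_STORE_BACKEND:-unset (check qpq-server.toml)}"
+echo "db:       ${QPQ_DB_PATH:-data/qpq.db}"
+echo "data dir: ${QPQ_DATA_DIR:-data}"
+```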
+
+## Data Inventory
+
+| Data | Location | Backend | Contains |
+|------|----------|---------|----------|
+| SQLCipher DB | `QPQ_DB_PATH` (default `data/qpq.db`) | `store_backend=sql` | Users, key packages, delivery queues, sessions, KT log, OPAQUE setup, blob metadata, moderation |
+| File store | `QPQ_DATA_DIR` (default `data/`) | `store_backend=file` | Bincode-serialized key packages, delivery queues, server state |
+| Blob storage | `QPQ_DATA_DIR/blobs/` | Filesystem | Uploaded file transfer blobs |
+| TLS certificates | `QPQ_TLS_CERT`, `QPQ_TLS_KEY` | DER files | Server identity |
+| OPAQUE ServerSetup | Inside DB or file store | Persisted | OPAQUE credential state (critical for auth) |
+| Server signing key | Inside DB or file store | Persisted | Ed25519 key for delivery proofs |
+| KT Merkle log | Inside DB or file store | Persisted | Key transparency audit log |
+
+## SQLCipher Backup
+
+### Hot Backup (Online)
+
+SQLCipher supports the `.backup` command while the server is running (WAL mode allows concurrent readers).
+
+```bash
+# 1. Open the encrypted database with the same key
+sqlite3 data/qpq.db
+
+# 2. At the sqlite3 prompt, set the encryption key
+PRAGMA key = 'your-db-key-here';
+
+# 3. Perform an online backup (use a literal path; the sqlite3 prompt
+#    does not expand shell substitutions such as $(date ...))
+.backup /backups/qpq-20260304.db
+
+.quit
+```
+
+### Scripted Hot Backup
+
+```bash
+#!/bin/bash
+set -euo pipefail
+
+BACKUP_DIR="/backups/qpq"
+DB_PATH="${QPQ_DB_PATH:-data/qpq.db}"
+DB_KEY="${QPQ_DB_KEY}"
+TIMESTAMP=$(date +%Y%m%d-%H%M%S)
+BACKUP_FILE="${BACKUP_DIR}/qpq-${TIMESTAMP}.db"
+
+mkdir -p "$BACKUP_DIR"
+
+sqlite3 "$DB_PATH" <<EOF
+PRAGMA key = '${DB_KEY}';
+.backup "${BACKUP_FILE}"
+EOF
+```
+
+### Cold Backup (Offline)
+
+```bash
+# 1. Stop the server
+systemctl stop qpq-server
+
+# 2. Copy the database file
+cp data/qpq.db /backups/
+
+# 3. Copy the WAL/SHM sidecar files if present
+cp data/qpq.db-wal /backups/ 2>/dev/null || true
+cp data/qpq.db-shm /backups/ 2>/dev/null || true
+
+# 4. Restart the server
+systemctl start qpq-server
+```
+
+## File Backend Backup
+
+When using `store_backend=file`, data is stored as bincode files under `QPQ_DATA_DIR`.
+
+```bash
+# Full directory backup
+tar czf /backups/qpq-data-$(date +%Y%m%d-%H%M%S).tar.gz \
+  -C "$(dirname "${QPQ_DATA_DIR:-data}")" \
+  "$(basename "${QPQ_DATA_DIR:-data}")"
+```
+
+## Blob Storage Backup
+
+Blobs are stored in `QPQ_DATA_DIR/blobs/`. These are immutable once written.
+
+```bash
+# Incremental rsync (blobs are write-once, ideal for rsync)
+rsync -av --progress data/blobs/ /backups/blobs/
+```
+
+## TLS Certificate Backup
+
+```bash
+# Back up TLS certificates (store separately from DB backups)
+cp data/server-cert.der /backups/tls/server-cert.der
+cp data/server-key.der /backups/tls/server-key.der
+
+# Federation certs (if federation is enabled)
+cp data/federation-cert.der /backups/tls/federation-cert.der 2>/dev/null || true
+cp data/federation-key.der /backups/tls/federation-key.der 2>/dev/null || true
+cp data/federation-ca.der /backups/tls/federation-ca.der 2>/dev/null || true
+```
+
+## Restore Procedures
+
+### Restore SQLCipher Database
+
+```bash
+# 1. Stop the server
+systemctl stop qpq-server
+
+# 2. Move the current (corrupt/lost) database aside
+mv data/qpq.db data/qpq.db.broken 2>/dev/null || true
+rm -f data/qpq.db-wal data/qpq.db-shm
+
+# 3. Copy the backup in place
+cp /backups/qpq-20260304.db data/qpq.db
+
+# 4. Verify integrity
+sqlite3 data/qpq.db "PRAGMA key = '${QPQ_DB_KEY}'; PRAGMA integrity_check;"
+
+# 5. Start the server (migrations will apply automatically if needed)
+systemctl start qpq-server
+```
+
+### Restore File Backend
+
+```bash
+# 1. Stop the server
+systemctl stop qpq-server
+
+# 2. Replace the data directory
+mv data data.broken 2>/dev/null || true
+tar xzf /backups/qpq-data-20260304.tar.gz -C .
+
+# 3. 
Restore TLS certs if not included in the data backup +cp /backups/tls/server-cert.der data/server-cert.der +cp /backups/tls/server-key.der data/server-key.der + +# 4. Start the server +systemctl start qpq-server +``` + +### Restore Blobs Only + +```bash +rsync -av /backups/blobs/ data/blobs/ +``` + +## Backup Schedule Recommendations + +| Frequency | What | Method | +|-----------|------|--------| +| Every 6 hours | SQLCipher database | Hot backup script via cron | +| Daily | File backend / full data dir | tar + offsite copy | +| Continuous | Blobs | rsync (incremental) | +| On change | TLS certificates | Manual + secret manager | + +## Cron Example + +```cron +# SQLCipher hot backup every 6 hours +0 */6 * * * /opt/qpq/scripts/backup-db.sh >> /var/log/qpq-backup.log 2>&1 + +# Full data directory daily at 02:00 +0 2 * * * tar czf /backups/qpq-data-$(date +\%Y\%m\%d).tar.gz -C /var/lib quicproquo + +# Blob sync every hour +0 * * * * rsync -a /var/lib/quicproquo/blobs/ /backups/blobs/ + +# Prune backups older than 30 days +0 3 * * 0 find /backups -name 'qpq-*' -mtime +30 -delete +``` + +## Verification + +Always verify backups after creation: + +```bash +# SQLCipher integrity check +sqlite3 /backups/qpq-latest.db \ + "PRAGMA key = '${QPQ_DB_KEY}'; PRAGMA integrity_check; SELECT count(*) FROM users;" + +# File backend: check the archive is valid +tar tzf /backups/qpq-data-latest.tar.gz > /dev/null + +# TLS cert: check it parses and is not expired +openssl x509 -inform DER -in /backups/tls/server-cert.der -noout -dates +``` diff --git a/docs/operations/dashboards/qpq-overview.json b/docs/operations/dashboards/qpq-overview.json new file mode 100644 index 0000000..39690b7 --- /dev/null +++ b/docs/operations/dashboards/qpq-overview.json @@ -0,0 +1,395 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.0.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + } + ], + "id": null, + "uid": "qpq-overview", + "title": "quicproquo Server Overview", + "description": "Operational dashboard for quicproquo server instances", + "tags": ["quicproquo", "qpq"], + "timezone": "browser", + "editable": true, + "graphTooltip": 1, + "refresh": "10s", + "time": { + "from": "now-1h", + "to": "now" + }, + "panels": [ + { + "title": "Server Status", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 0 }, + "targets": [ + { + "expr": "up{job=\"qpq-server\"}", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "mappings": [ + { "type": "value", "options": { "0": { "text": "DOWN", "color": "red" }, "1": { "text": "UP", "color": "green" } } } + ], + "thresholds": { + "steps": [ + { "value": null, "color": "red" }, + { "value": 1, "color": "green" } + ] + } + } + } + }, + { + "title": "Enqueue Rate", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 0 }, + "targets": [ + { + "expr": "rate(enqueue_total[5m])", + "legendFormat": "msgs/sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "thresholds": { + "steps": [ + { "value": null, 
"color": "green" }, + { "value": 100, "color": "yellow" }, + { "value": 500, "color": "red" } + ] + } + } + } + }, + { + "title": "Fetch Rate", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 0 }, + "targets": [ + { + "expr": "rate(fetch_total[5m])", + "legendFormat": "fetches/sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "thresholds": { + "steps": [ + { "value": null, "color": "green" } + ] + } + } + } + }, + { + "title": "Auth Success Rate", + "type": "gauge", + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 0 }, + "targets": [ + { + "expr": "rate(auth_login_success_total[5m]) / (rate(auth_login_success_total[5m]) + rate(auth_login_failure_total[5m]))", + "legendFormat": "success ratio" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "thresholds": { + "steps": [ + { "value": null, "color": "red" }, + { "value": 0.5, "color": "yellow" }, + { "value": 0.9, "color": "green" } + ] + } + } + } + }, + { + "title": "Delivery Queue Depth", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 0 }, + "targets": [ + { + "expr": "delivery_queue_depth", + "legendFormat": "depth" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { "value": null, "color": "green" }, + { "value": 10000, "color": "yellow" }, + { "value": 100000, "color": "red" } + ] + } + } + } + }, + { + "title": "Rate Limit Hits", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 0 }, + "targets": [ + { + "expr": "rate(rate_limit_hit_total[5m])", + "legendFormat": "hits/sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "thresholds": { + "steps": [ + { "value": null, "color": "green" }, + { "value": 1, "color": "yellow" }, + { "value": 10, "color": "red" } + ] + } + } + } + }, + { + "title": "Message Throughput", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, + "targets": [ + { + "expr": "rate(enqueue_total[5m])", + "legendFormat": "enqueue rate" + }, + { + "expr": "rate(fetch_total[5m])", + "legendFormat": "fetch rate" + }, + { + "expr": "rate(fetch_wait_total[5m])", + "legendFormat": "fetch_wait rate" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 10 + } + } + } + }, + { + "title": "Enqueue Bandwidth", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, + "targets": [ + { + "expr": "rate(enqueue_bytes_total[5m])", + "legendFormat": "bytes/sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps", + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 10, + "gradientMode": "scheme" + } + } + } + }, + { + "title": "Authentication", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, + "targets": [ + { + "expr": "rate(auth_login_success_total[5m])", + "legendFormat": "success/sec" + }, + { + "expr": "rate(auth_login_failure_total[5m])", + "legendFormat": "failure/sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 10 + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "failure/sec" }, + "properties": [ + { "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } } + ] + } + ] + } + }, + { + "title": "Delivery Queue Depth Over Time", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 }, + "targets": [ + { + "expr": 
"delivery_queue_depth", + "legendFormat": "queue depth" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 20, + "gradientMode": "scheme", + "thresholdsStyle": { "mode": "area" } + }, + "thresholds": { + "steps": [ + { "value": null, "color": "green" }, + { "value": 10000, "color": "yellow" }, + { "value": 100000, "color": "red" } + ] + } + } + } + }, + { + "title": "Rate Limiting & Key Packages", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 }, + "targets": [ + { + "expr": "rate(rate_limit_hit_total[5m])", + "legendFormat": "rate limit hits/sec" + }, + { + "expr": "rate(key_package_upload_total[5m])", + "legendFormat": "key package uploads/sec" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 10 + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "rate limit hits/sec" }, + "properties": [ + { "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } } + ] + } + ] + } + }, + { + "title": "Cumulative Totals", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 }, + "targets": [ + { + "expr": "enqueue_total", + "legendFormat": "total enqueued" + }, + { + "expr": "fetch_total", + "legendFormat": "total fetched" + }, + { + "expr": "auth_login_success_total", + "legendFormat": "total logins" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 5 + } + } + } + } + ], + "templating": { + "list": [ + { + "name": "datasource", + "type": "datasource", + "query": "prometheus", + "current": {}, + "hide": 0 + } + ] + }, + "annotations": { + "list": [ + { + "name": "Deploys", + "datasource": "-- Grafana --", + "enable": true, + "iconColor": "blue", + "tags": ["deploy"] + } + ] + }, + "schemaVersion": 38, + "version": 1 +} diff --git a/docs/operations/grafana-provisioning/dashboards/default.yml b/docs/operations/grafana-provisioning/dashboards/default.yml new file mode 100644 index 0000000..d253e67 --- /dev/null +++ b/docs/operations/grafana-provisioning/dashboards/default.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: 'default' + orgId: 1 + folder: 'quicproquo' + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false diff --git a/docs/operations/grafana-provisioning/datasources/prometheus.yml b/docs/operations/grafana-provisioning/datasources/prometheus.yml new file mode 100644 index 0000000..bb009bb --- /dev/null +++ b/docs/operations/grafana-provisioning/datasources/prometheus.yml @@ -0,0 +1,9 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: false diff --git a/docs/operations/incident-response.md b/docs/operations/incident-response.md new file mode 100644 index 0000000..d3da08b --- /dev/null +++ b/docs/operations/incident-response.md @@ -0,0 +1,338 @@ +# Incident Response Playbook + +This document provides procedures for responding to common operational incidents in a quicproquo deployment. 
+
+## Severity Levels
+
+| Level | Description | Response Time | Examples |
+|-------|-------------|---------------|----------|
+| P1 - Critical | Service down, data loss, key compromise | Immediate | Server crash loop, DB corruption, leaked secrets |
+| P2 - Major | Degraded service, partial outage | 15 minutes | High latency, storage full, cert expiry |
+| P3 - Minor | Non-critical issue, monitoring alert | 1 hour | Rate limit spikes, non-critical warnings |
+
+## Incident: Server Not Starting
+
+### Symptoms
+- Server process exits immediately
+- Logs show "TLS cert or key missing" or "production forbids" errors
+
+### Diagnosis
+
+```bash
+# Check server logs
+journalctl -u qpq-server --since "10 min ago" --no-pager
+
+# Docker
+docker compose logs --tail=50 server
+```
+
+### Common Causes and Fixes
+
+**Missing TLS certificates (production mode)**
+```bash
+# Production requires pre-existing certs (no auto-generation)
+ls -la data/server-cert.der data/server-key.der
+
+# If missing, restore from backup or generate new ones
+# See: key-rotation.md
+```
+
+**Missing auth token (production mode)**
+```bash
+# Production requires QPQ_AUTH_TOKEN >= 16 chars, not "devtoken"
+echo -n "$QPQ_AUTH_TOKEN" | wc -c
+```
+
+**Database locked or corrupt**
+```bash
+# Check if another process holds the database
+fuser data/qpq.db
+
+# Verify database integrity
+sqlite3 data/qpq.db "PRAGMA key='${QPQ_DB_KEY}'; PRAGMA integrity_check;"
+```
+
+**Port already in use**
+```bash
+# Check if something is already listening on UDP port 7000 (QUIC)
+ss -ulnp | grep 7000
+```
+
+## Incident: Node Down / Unresponsive
+
+### Symptoms
+- Clients cannot connect
+- Health check failures
+- No new log entries
+
+### Diagnosis
+
+```bash
+# 1. Check if the process is running
+systemctl status qpq-server
+# or: docker compose ps
+
+# 2. Check resource usage
+top -bn1 | grep qpq
+df -h /var/lib/quicproquo
+free -h
+
+# 3. Check QUIC port is reachable
+# From another host:
+nc -uzv <server-host> 7000
+
+# 4. Check for OOM kills
+dmesg | grep -i "out of memory\|oom" | tail -5
+journalctl -k | grep -i oom
+```
+
+### Recovery
+
+```bash
+# Restart the service
+systemctl restart qpq-server
+
+# If OOM: increase memory limit
+systemctl edit qpq-server --force
+# MemoryMax=2G
+
+# If disk full: see "Storage Full" incident below
+```
+
+## Incident: Storage Full
+
+### Symptoms
+- `enqueue` operations fail
+- Logs show "No space left on device"
+- `delivery_queue_depth` gauge rising
+
+### Diagnosis
+
+```bash
+# Check disk usage
+df -h /var/lib/quicproquo
+du -sh /var/lib/quicproquo/*
+
+# Check largest files
+du -a /var/lib/quicproquo | sort -rn | head -20
+
+# Check blob storage specifically
+du -sh /var/lib/quicproquo/blobs/
+find /var/lib/quicproquo/blobs/ -type f | wc -l
+```
+
+### Recovery
+
+```bash
+# 1. Identify and remove expired messages (the cleanup task handles this,
+# but if it's behind, you can trigger manual cleanup)
+
+# For SQL backend: delete expired delivery messages
+sqlite3 data/qpq.db <<EOF
+PRAGMA key = '${QPQ_DB_KEY}';
+DELETE FROM delivery_queue WHERE expires_at IS NOT NULL AND expires_at < unixepoch();
+VACUUM;
+EOF
+
+# 2. Remove orphaned blobs (blobs not referenced by any message)
+# This is application-specific; coordinate with the codebase
+
+# 3. If the data partition is full, expand the volume
+# AWS EBS: aws ec2 modify-volume --volume-id vol-xxx --size 100
+# Then: resize2fs /dev/xvdf
+
+# 4. 
Move to a larger disk
+systemctl stop qpq-server
+rsync -av /var/lib/quicproquo/ /mnt/new-volume/quicproquo/
+# Update QPQ_DATA_DIR and QPQ_DB_PATH to point to the new location
+systemctl start qpq-server
+```
+
+### Prevention
+
+- Set up disk usage alerts at 70% and 90% thresholds.
+- Configure message TTL (`ttl_secs`) to auto-expire old messages.
+- Schedule regular `VACUUM` on the SQLCipher database.
+
+## Incident: DDoS / Connection Flood
+
+### Symptoms
+- `rate_limit_hit_total` counter spiking
+- `auth_login_failure_total` counter spiking
+- High CPU usage
+- Legitimate clients cannot connect
+
+### Diagnosis
+
+```bash
+# Check connection rate limit hits in metrics
+curl -s http://localhost:9090/metrics | grep rate_limit
+
+# Check auth failure rate
+curl -s http://localhost:9090/metrics | grep auth_login_failure
+
+# Check active connections (QUIC uses UDP)
+ss -unp | grep 7000 | wc -l
+```
+
+### Mitigation
+
+```bash
+# 1. The server has built-in per-IP connection rate limiting.
+# Check the logs for "connection rate limit exceeded" messages.
+
+# 2. Block offending IPs at the firewall level
+iptables -A INPUT -s <offending-ip> -j DROP
+
+# 3. For volumetric attacks, use upstream DDoS protection
+# (Cloudflare Spectrum, AWS Shield, etc.)
+
+# 4. If the server is overwhelmed, restart to clear state
+systemctl restart qpq-server
+
+# 5. Enable log redaction to reduce I/O pressure during attacks
+# Set QPQ_REDACT_LOGS=true
+```
+
+## Incident: Key Compromise
+
+### Auth Token Compromised
+
+**Severity: P1**
+
+```bash
+# 1. Immediately rotate the auth token
+NEW_TOKEN=$(openssl rand -base64 32)
+
+# 2. Update server config and restart
+# See: key-rotation.md "Auth Token Rotation"
+
+# 3. Notify all legitimate clients of the new token
+
+# 4. Review logs for unauthorized access
+journalctl -u qpq-server | grep "auth_login_success" | tail -100
+```
+
+### TLS Private Key Compromised
+
+**Severity: P1**
+
+```bash
+# 1. Generate and install a new certificate immediately
+# See: key-rotation.md "TLS Certificate Rotation"
+
+# 2. Revoke the compromised certificate with your CA
+# (procedure depends on your CA)
+
+# 3. Restart the server with the new certificate
+systemctl restart qpq-server
+
+# 4. If clients pin certificates, notify them of the change
+```
+
+### Database Key Compromised
+
+**Severity: P1**
+
+```bash
+# 1. Stop the server
+systemctl stop qpq-server
+
+# 2. Rekey the database immediately
+# See: key-rotation.md "Database Encryption Key Rotation"
+
+# 3. Assess data exposure
+# If the attacker had access to the database file, assume all
+# stored data (users, key packages, delivery queues) is compromised.
+
+# 4. Consider notifying affected users
+```
+
+### OPAQUE ServerSetup Compromised
+
+**Severity: P1**
+
+```bash
+# 1. Rotate the OPAQUE ServerSetup
+# See: key-rotation.md "OPAQUE ServerSetup Rotation"
+
+# WARNING: This invalidates ALL OPAQUE credentials.
+# All users must re-register.
+
+# 2. All users must re-register with new credentials
+# 3. Review logs for unauthorized OPAQUE authentications
+```
+
+## Incident: High Latency
+
+### Symptoms
+- Clients report slow message delivery
+- `delivery_queue_depth` gauge is high
+- Fetch operations are slow
+
+### Diagnosis
+
+```bash
+# 1. Check system resources
+top -bn1 | head -20
+iostat -x 1 3
+
+# 2. 
Check database performance
+sqlite3 data/qpq.db <<EOF
+PRAGMA key = '${QPQ_DB_KEY}';
+PRAGMA integrity_check;
+PRAGMA wal_checkpoint(PASSIVE);
+-- Check table sizes
+SELECT 'delivery_queue', count(*) FROM delivery_queue
+UNION ALL SELECT 'key_packages', count(*) FROM key_packages
+UNION ALL SELECT 'users', count(*) FROM users;
+EOF
+
+# 3. Check queue depth via metrics
+curl -s http://localhost:9090/metrics | grep delivery_queue_depth
+```
+
+### Recovery
+
+```bash
+# 1. Checkpoint the WAL (reduces WAL file size)
+sqlite3 data/qpq.db "PRAGMA key='${QPQ_DB_KEY}'; PRAGMA wal_checkpoint(TRUNCATE);"
+
+# 2. VACUUM to reclaim space and defragment
+sqlite3 data/qpq.db "PRAGMA key='${QPQ_DB_KEY}'; VACUUM;"
+
+# 3. If the queue is huge, check for clients not fetching
+# (delivery_queue rows accumulate when clients are offline)
+
+# 4. If I/O-bound: move database to faster storage (SSD/NVMe)
+```
+
+## Incident: Certificate Expiring
+
+### Symptoms
+- Log warning: "TLS certificate expires within 30 days"
+- Monitoring alert on certificate expiry
+
+### Response
+
+```bash
+# 1. Check current certificate expiry
+openssl x509 -inform DER -in data/server-cert.der -noout -enddate
+
+# 2. Renew the certificate
+# See: key-rotation.md "TLS Certificate Rotation"
+
+# 3. Verify the new certificate is loaded
+journalctl -u qpq-server --since "1 min ago" | grep -i cert
+```
+
+## Post-Incident Checklist
+
+After resolving any incident:
+
+1. **Document** the incident: timeline, root cause, resolution steps
+2. **Verify** the service is fully operational (check metrics, test client connections)
+3. **Review** whether monitoring would have caught this earlier
+4. **Update** alerts and thresholds based on findings
+5. **Communicate** with affected users if there was data exposure or service disruption
+6. **Schedule** follow-up actions (e.g., add monitoring, improve automation)
diff --git a/docs/operations/key-rotation.md b/docs/operations/key-rotation.md
new file mode 100644
index 0000000..ba7e105
--- /dev/null
+++ b/docs/operations/key-rotation.md
@@ -0,0 +1,250 @@
+# Key Rotation Procedures
+
+This document provides step-by-step procedures for rotating all cryptographic material in a quicproquo deployment.
+
+## Auth Token Rotation
+
+The auth token (`QPQ_AUTH_TOKEN`) is used for bearer-token authentication (auth version 1). OPAQUE-authenticated sessions are not affected by token rotation.
+
+### Procedure
+
+```bash
+# 1. Generate a new token (minimum 16 characters for production)
+NEW_TOKEN=$(openssl rand -base64 32)
+echo "New token: $NEW_TOKEN"
+
+# 2. Update the config file or environment
+# Option A: TOML config file (use | as the sed delimiter; base64 tokens may contain /)
+sed -i "s|^auth_token = .*|auth_token = \"$NEW_TOKEN\"|" qpq-server.toml
+
+# Option B: Environment variable (systemd)
+systemctl edit qpq-server --force
+# Add: Environment=QPQ_AUTH_TOKEN=<new-token>
+
+# Option C: Docker Compose
+# Update QPQ_AUTH_TOKEN in docker-compose.prod.yml or .env file
+
+# 3. Restart the server
+systemctl restart qpq-server
+# or: docker compose restart server
+
+# 4. Update all clients with the new token
+# Clients using OPAQUE auth are unaffected.
+# Clients using bearer-token auth must update their QPQ_ACCESS_TOKEN.
+```
+
+### Impact
+
+- Active bearer-token sessions continue until they expire (sessions are in-memory).
+- New bearer-token connections must use the new token.
+- OPAQUE-authenticated clients are not affected.
+
+## TLS Certificate Rotation
+
+The server uses DER-encoded X.509 certificates for QUIC TLS 1.3. 
The server validates certificates at startup and warns if expiry is within 30 days.
+
+### Procedure
+
+```bash
+# 1. Obtain a new certificate (example with Let's Encrypt / certbot)
+certbot certonly --standalone -d chat.example.com
+
+# 2. Convert PEM to DER format (qpq-server expects DER)
+openssl x509 -in /etc/letsencrypt/live/chat.example.com/fullchain.pem \
+  -outform DER -out /tmp/server-cert.der
+
+openssl pkey -in /etc/letsencrypt/live/chat.example.com/privkey.pem \
+  -outform DER -out /tmp/server-key.der
+
+# 3. Set restrictive permissions on the private key
+chmod 600 /tmp/server-key.der
+
+# 4. Back up the current certificates
+cp data/server-cert.der data/server-cert.der.bak
+cp data/server-key.der data/server-key.der.bak
+
+# 5. Replace certificates
+cp /tmp/server-cert.der data/server-cert.der
+cp /tmp/server-key.der data/server-key.der
+
+# 6. Verify the new certificate
+openssl x509 -inform DER -in data/server-cert.der -noout -text | head -20
+
+# 7. Restart the server (QUIC requires restart for new TLS config)
+systemctl restart qpq-server
+
+# 8. Verify the server started with the new certificate
+journalctl -u qpq-server --since "1 min ago" | grep -i tls
+```
+
+### Self-Signed Certificate (Development)
+
+In non-production mode, the server auto-generates a self-signed certificate if none exists. To force regeneration:
+
+```bash
+rm data/server-cert.der data/server-key.der
+systemctl restart qpq-server
+# Server will generate a new self-signed cert for localhost/127.0.0.1/::1
+```
+
+### Automated Renewal with Certbot
+
+```bash
+#!/bin/bash
+# /opt/qpq/scripts/renew-cert.sh
+set -euo pipefail
+
+DOMAIN="chat.example.com"
+CERT_DIR="/etc/letsencrypt/live/$DOMAIN"
+QPQ_DATA="/var/lib/quicproquo"
+
+certbot renew --quiet
+
+openssl x509 -in "$CERT_DIR/fullchain.pem" -outform DER -out "$QPQ_DATA/server-cert.der"
+openssl pkey -in "$CERT_DIR/privkey.pem" -outform DER -out "$QPQ_DATA/server-key.der"
+chmod 600 "$QPQ_DATA/server-key.der"
+chown qpq:qpq "$QPQ_DATA/server-cert.der" "$QPQ_DATA/server-key.der"
+
+systemctl restart qpq-server
+```
+
+```cron
+# Run cert renewal check twice daily
+0 3,15 * * * /opt/qpq/scripts/renew-cert.sh >> /var/log/qpq-cert-renew.log 2>&1
+```
+
+## Federation Certificate Rotation
+
+Federation uses mutual TLS (mTLS) with a shared CA for server-to-server authentication.
+
+### Procedure
+
+```bash
+# 1. Generate a new federation certificate signed by the federation CA
+openssl req -new -nodes -keyout /tmp/federation-key.pem \
+  -out /tmp/federation.csr -subj "/CN=chat.example.com"
+
+openssl x509 -req -in /tmp/federation.csr \
+  -CA federation-ca.pem -CAkey federation-ca-key.pem \
+  -CAcreateserial -days 365 -out /tmp/federation-cert.pem
+
+# 2. Convert to DER
+openssl x509 -in /tmp/federation-cert.pem -outform DER -out data/federation-cert.der
+openssl pkey -in /tmp/federation-key.pem -outform DER -out data/federation-key.der
+chmod 600 data/federation-key.der
+
+# 3. Restart the server
+systemctl restart qpq-server
+
+# 4. Coordinate with federation peers: they must trust the same CA
+```
+
+## Database Encryption Key Rotation
+
+The SQLCipher database key (`QPQ_DB_KEY`) encrypts all data at rest.
+
+### Procedure (SQLCipher PRAGMA rekey)
+
+```bash
+# 1. Stop the server
+systemctl stop qpq-server
+
+# 2. Back up the database
+cp data/qpq.db /backups/qpq-pre-rekey-$(date +%Y%m%d).db
+
+# 3. Rekey the database
+NEW_KEY=$(openssl rand -hex 32)
+sqlite3 data/qpq.db <<EOF
+PRAGMA key = '${QPQ_DB_KEY}';
+PRAGMA rekey = '${NEW_KEY}';
+EOF
+
+# 4. Verify the database opens with the new key
+sqlite3 data/qpq.db "PRAGMA key='${NEW_KEY}'; PRAGMA integrity_check;"
+
+# 5. Update the stored key (or replace the existing QPQ_DB_KEY line)
+echo "QPQ_DB_KEY=${NEW_KEY}" >> .env
+
+# 6. 
Start the server
+systemctl start qpq-server
+```
+
+### Full Re-encryption (Alternative)
+
+If `PRAGMA rekey` is unavailable or you want a fresh database file:
+
+```bash
+# 1. Stop the server and back up
+systemctl stop qpq-server
+cp data/qpq.db /backups/qpq-pre-rekey.db
+
+# 2. Export with old key, import with new key
+sqlite3 data/qpq.db "PRAGMA key='old-key'; .dump" | \
+  sqlite3 data/qpq-new.db "PRAGMA key='new-key'; .read /dev/stdin"
+
+# 3. Replace the database
+mv data/qpq-new.db data/qpq.db
+
+# 4. Update config and restart
+systemctl start qpq-server
+```
+
+## OPAQUE ServerSetup Rotation
+
+The OPAQUE ServerSetup is generated once and persisted. Rotating it invalidates all registered OPAQUE credentials.
+
+**WARNING: Rotating the OPAQUE ServerSetup requires all users to re-register. Only do this if the setup is compromised.**
+
+```bash
+# 1. Stop the server
+systemctl stop qpq-server
+
+# 2. Back up the database
+cp data/qpq.db /backups/qpq-pre-opaque-rotate.db
+
+# 3. Delete the persisted OPAQUE setup
+# For SQL backend:
+sqlite3 data/qpq.db "PRAGMA key='${QPQ_DB_KEY}'; DELETE FROM server_state WHERE key = 'opaque_setup';"
+
+# For file backend:
+rm data/opaque_setup.bin 2>/dev/null || true
+
+# 4. Start the server (it will generate a new OPAQUE ServerSetup)
+systemctl start qpq-server
+
+# 5. All users must re-register (existing OPAQUE credentials are invalid)
+```
+
+## Server Signing Key Rotation
+
+The Ed25519 signing key is used for delivery proofs. Rotating it means old delivery proofs cannot be verified against the new key.
+
+```bash
+# 1. Stop the server
+systemctl stop qpq-server
+
+# 2. Back up
+cp data/qpq.db /backups/qpq-pre-sigkey-rotate.db
+
+# 3. Delete the persisted signing key seed
+# For SQL backend:
+sqlite3 data/qpq.db "PRAGMA key='${QPQ_DB_KEY}'; DELETE FROM server_state WHERE key = 'signing_key_seed';"
+
+# 4. Start the server (generates a new Ed25519 signing key)
+systemctl start qpq-server
+```
+
+## Rotation Schedule
+
+| Key Material | Rotation Frequency | Impact |
+|---|---|---|
+| Auth token | Quarterly or on compromise | Clients using bearer auth must update |
+| TLS certificate | Before expiry (automate with certbot) | Server restart required |
+| Federation cert | Annually or before expiry | Coordinate with peers |
+| DB encryption key | Annually or on compromise | Server downtime required |
+| OPAQUE ServerSetup | Only on compromise | All users must re-register |
+| Server signing key | Only on compromise | Old delivery proofs unverifiable |
diff --git a/docs/operations/monitoring.md b/docs/operations/monitoring.md
new file mode 100644
index 0000000..a6e4ec0
--- /dev/null
+++ b/docs/operations/monitoring.md
@@ -0,0 +1,225 @@
+# Monitoring Guide
+
+This document covers metrics collection, alerting, and dashboards for quicproquo.
+
+## Enabling Metrics
+
+The server exports Prometheus metrics via HTTP when configured:
+
+```bash
+# Environment variables
+QPQ_METRICS_LISTEN=0.0.0.0:9090
+QPQ_METRICS_ENABLED=true
+
+# Or in qpq-server.toml
+metrics_listen = "0.0.0.0:9090"
+metrics_enabled = true
+```
+
+Metrics are served at `http://<host>:<port>/metrics` in Prometheus exposition format. 
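+
+A quick way to confirm the exporter is reachable and see raw samples (assuming the listener configured above, scraped from localhost):
+
+```bash
+# Spot-check a few of the counters and gauges documented below
+curl -s http://localhost:9090/metrics | grep -E '^(enqueue_total|fetch_total|delivery_queue_depth)'
+```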
+
+## Available Metrics
+
+### Counters
+
+| Metric | Description | Labels |
+|--------|-------------|--------|
+| `enqueue_total` | Total messages enqueued | - |
+| `enqueue_bytes_total` | Total bytes enqueued | - |
+| `fetch_total` | Total message fetches completed | - |
+| `fetch_wait_total` | Total long-poll fetch waits | - |
+| `key_package_upload_total` | Total MLS key package uploads | - |
+| `auth_login_success_total` | Successful OPAQUE login completions | - |
+| `auth_login_failure_total` | Failed login attempts | - |
+| `rate_limit_hit_total` | Rate limit rejections | - |
+
+### Gauges
+
+| Metric | Description |
+|--------|-------------|
+| `delivery_queue_depth` | Current delivery queue depth (sampled) |
+
+## Prometheus Configuration
+
+```yaml
+# prometheus.yml
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+scrape_configs:
+  - job_name: 'qpq-server'
+    static_configs:
+      - targets: ['qpq-server:9090']
+    scrape_interval: 10s
+```
+
+## Alert Rules
+
+```yaml
+# prometheus-alerts.yml
+groups:
+  - name: qpq-server
+    rules:
+      # Server down
+      - alert: QpqServerDown
+        expr: up{job="qpq-server"} == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "qpq-server is down"
+          description: "Prometheus cannot scrape qpq-server metrics for > 1 minute."
+
+      # High auth failure rate (potential brute force)
+      - alert: QpqHighAuthFailureRate
+        expr: rate(auth_login_failure_total[5m]) > 10
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High authentication failure rate"
+          description: "{{ $value | printf \"%.1f\" }} auth failures/sec over 5 minutes."
+
+      # Rate limiting active
+      - alert: QpqRateLimitActive
+        expr: rate(rate_limit_hit_total[5m]) > 5
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Rate limiting is actively rejecting requests"
+          description: "{{ $value | printf \"%.1f\" }} rate limit hits/sec."
+
+      # Delivery queue growing
+      - alert: QpqDeliveryQueueHigh
+        expr: delivery_queue_depth > 10000
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Delivery queue depth is high"
+          description: "Queue depth: {{ $value }}. Clients may not be fetching."
+
+      - alert: QpqDeliveryQueueCritical
+        expr: delivery_queue_depth > 100000
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Delivery queue depth is critical"
+          description: "Queue depth: {{ $value }}. Investigate immediately."
+
+      # No enqueue activity (service may be stuck)
+      - alert: QpqNoEnqueueActivity
+        expr: rate(enqueue_total[15m]) == 0
+        for: 30m
+        labels:
+          severity: warning
+        annotations:
+          summary: "No messages enqueued in 30 minutes"
+          description: "Check if the service is accepting connections."
+
+      # Auth success ratio too low
+      - alert: QpqLowAuthSuccessRatio
+        expr: >
+          rate(auth_login_success_total[5m])
+          / (rate(auth_login_success_total[5m]) + rate(auth_login_failure_total[5m]))
+          < 0.5
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Auth success ratio below 50%"
+          description: "More than half of login attempts are failing."
+```
+
+## Key Dashboard Panels
+
+See `dashboards/qpq-overview.json` for the full Grafana dashboard. 
Key panels: + +### Message Throughput +- **Enqueue rate**: `rate(enqueue_total[5m])` +- **Fetch rate**: `rate(fetch_total[5m])` +- **Enqueue bandwidth**: `rate(enqueue_bytes_total[5m])` + +### Authentication +- **Login success rate**: `rate(auth_login_success_total[5m])` +- **Login failure rate**: `rate(auth_login_failure_total[5m])` +- **Success ratio**: `rate(auth_login_success_total[5m]) / (rate(auth_login_success_total[5m]) + rate(auth_login_failure_total[5m]))` + +### Delivery Queue +- **Queue depth**: `delivery_queue_depth` +- **Queue growth rate**: `deriv(delivery_queue_depth[10m])` + +### Rate Limiting +- **Rate limit hits**: `rate(rate_limit_hit_total[5m])` + +### Infrastructure (Node Exporter) +- CPU, memory, disk, network from `node_exporter` + +## Grafana Dashboard + +Import the dashboard from `dashboards/qpq-overview.json`: + +1. Open Grafana -> Dashboards -> Import +2. Upload `docs/operations/dashboards/qpq-overview.json` +3. Select your Prometheus data source +4. Save + +## Log Monitoring + +The server uses `tracing` with `RUST_LOG` environment variable: + +```bash +# Production: info level with structured JSON output +RUST_LOG=info + +# Debug specific modules +RUST_LOG=info,quicproquo_server::node_service=debug + +# Verbose debugging +RUST_LOG=debug +``` + +### Key Log Messages to Monitor + +| Log Pattern | Meaning | Action | +|-------------|---------|--------| +| `"TLS certificate expires within 30 days"` | Cert expiring soon | Rotate certificate | +| `"TLS certificate is self-signed"` | Self-signed cert in use | Replace with CA-signed cert in production | +| `"connection rate limit exceeded"` | IP being rate limited | Check for DDoS | +| `"running without QPQ_AUTH_TOKEN"` | Insecure mode | Must not appear in production | +| `"db_key is empty; SQL store will be plaintext"` | Unencrypted DB | Must not appear in production | +| `"shutdown signal received"` | Graceful shutdown started | Expected during deploys | +| `"generated and persisted new OPAQUE ServerSetup"` | Fresh OPAQUE setup | Expected on first start only | + +### Log Aggregation + +For production, pipe logs to a log aggregator: + +```bash +# Systemd -> journald -> Loki/Elasticsearch +journalctl -u qpq-server -f --output=json | \ + promtail --stdin --client.url=http://loki:3100/loki/api/v1/push + +# Docker -> Loki driver +docker run --log-driver=loki \ + --log-opt loki-url="http://loki:3100/loki/api/v1/push" \ + qpq-server +``` + +## Health Checking + +The Docker image includes a basic health check (TLS cert file exists). For deeper health checks: + +```bash +# Simple: check the process is running and port is open +ss -ulnp | grep 7000 + +# Metrics endpoint (if enabled) +curl -sf http://localhost:9090/metrics > /dev/null + +# Full client connection test +qpq-client --server 127.0.0.1:7000 --auth-token "$TOKEN" --ping +``` diff --git a/docs/operations/prometheus-alerts.yml b/docs/operations/prometheus-alerts.yml new file mode 100644 index 0000000..8f2ae2b --- /dev/null +++ b/docs/operations/prometheus-alerts.yml @@ -0,0 +1,55 @@ +groups: + - name: qpq-server + rules: + - alert: QpqServerDown + expr: up{job="qpq-server"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "qpq-server is down" + description: "Prometheus cannot scrape qpq-server metrics for > 1 minute." 
+ + - alert: QpqHighAuthFailureRate + expr: rate(auth_login_failure_total[5m]) > 10 + for: 2m + labels: + severity: warning + annotations: + summary: "High authentication failure rate" + description: "{{ $value | printf \"%.1f\" }} auth failures/sec over 5 minutes." + + - alert: QpqRateLimitActive + expr: rate(rate_limit_hit_total[5m]) > 5 + for: 5m + labels: + severity: warning + annotations: + summary: "Rate limiting is actively rejecting requests" + + - alert: QpqDeliveryQueueHigh + expr: delivery_queue_depth > 10000 + for: 10m + labels: + severity: warning + annotations: + summary: "Delivery queue depth is high ({{ $value }})" + + - alert: QpqDeliveryQueueCritical + expr: delivery_queue_depth > 100000 + for: 5m + labels: + severity: critical + annotations: + summary: "Delivery queue depth is critical ({{ $value }})" + + - alert: QpqLowAuthSuccessRatio + expr: > + rate(auth_login_success_total[5m]) + / (rate(auth_login_success_total[5m]) + rate(auth_login_failure_total[5m])) + < 0.5 + for: 10m + labels: + severity: warning + annotations: + summary: "Auth success ratio below 50%" diff --git a/docs/operations/prometheus.yml b/docs/operations/prometheus.yml new file mode 100644 index 0000000..b96636f --- /dev/null +++ b/docs/operations/prometheus.yml @@ -0,0 +1,12 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +rule_files: + - "alerts.yml" + +scrape_configs: + - job_name: 'qpq-server' + static_configs: + - targets: ['server:9090'] + scrape_interval: 10s diff --git a/docs/operations/scaling-guide.md b/docs/operations/scaling-guide.md new file mode 100644 index 0000000..3754853 --- /dev/null +++ b/docs/operations/scaling-guide.md @@ -0,0 +1,244 @@ +# Scaling Guide + +This document covers resource sizing, scaling triggers, and capacity planning for quicproquo deployments. + +## Architecture Overview + +quicproquo runs as a single-process server handling QUIC connections. Key resource consumers: + +- **CPU**: TLS 1.3 handshakes (QUIC), OPAQUE PAKE authentication, message routing +- **Memory**: In-memory session state (DashMap), QUIC connection state, delivery waiters, rate limit entries +- **Disk I/O**: SQLCipher reads/writes (WAL mode), blob storage, KT Merkle log +- **Network**: QUIC (UDP), metrics HTTP, optional WebSocket bridge + +## Single-Node Sizing + +### Minimum (Development / Small Team) + +| Resource | Value | +|----------|-------| +| CPU | 1 vCPU | +| Memory | 512 MB | +| Disk | 10 GB SSD | +| Network | 100 Mbps | + +Supports ~100 concurrent users, light message traffic. + +### Recommended (Production / Small-Medium) + +| Resource | Value | +|----------|-------| +| CPU | 2-4 vCPU | +| Memory | 2-4 GB | +| Disk | 50-100 GB NVMe SSD | +| Network | 1 Gbps | + +Supports ~1,000-5,000 concurrent users. + +### Large (High Traffic) + +| Resource | Value | +|----------|-------| +| CPU | 8+ vCPU | +| Memory | 8-16 GB | +| Disk | 500 GB+ NVMe SSD (RAID 10) | +| Network | 10 Gbps | + +Supports ~10,000+ concurrent users. 
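+
+To see where a running node sits relative to these tiers, a quick snapshot helps (a sketch; assumes the data directory and UDP port used elsewhere in this guide):
+
+```bash
+#!/bin/bash
+# Snapshot current usage against the sizing tiers above
+echo "CPU cores : $(nproc)"
+echo "Memory    : $(free -h | awk '/^Mem:/ {print $3 " used / " $2 " total"}')"
+echo "Data disk : $(df -h /var/lib/quicproquo | awk 'NR==2 {print $3 " used / " $2 " (" $5 ")"}')"
+echo "UDP peers : $(ss -unp 2>/dev/null | grep -c 7000)"
+```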
+ +## Scaling Triggers + +Monitor these metrics and scale when thresholds are exceeded: + +| Metric | Warning | Critical | Action | +|--------|---------|----------|--------| +| CPU usage | > 70% sustained (5 min) | > 90% sustained | Add CPU or scale horizontally | +| Memory usage | > 75% | > 90% | Increase memory, check for leaks | +| Disk usage | > 70% | > 90% | Expand volume, clean old data | +| Disk I/O latency | > 5 ms p95 | > 20 ms p95 | Move to faster storage | +| `delivery_queue_depth` | > 10,000 | > 100,000 | Investigate stale queues | +| `rate_limit_hit_total` rate | > 100/min | > 1000/min | Investigate abuse, adjust limits | +| `auth_login_failure_total` rate | > 50/min | > 500/min | Potential brute force attack | +| Connection count | > 80% of `max_concurrent_bidi_streams` | > 95% | Scale horizontally | +| TLS handshake latency | > 100 ms p95 | > 500 ms p95 | Add CPU, check network | + +## Vertical Scaling + +### CPU Scaling + +The server is async (Tokio) and benefits from multiple cores. QUIC TLS handshakes and OPAQUE computations are CPU-intensive. + +```bash +# Check current CPU usage +top -bn1 -p $(pgrep qpq-server) + +# For Docker: increase CPU limits +# docker-compose.prod.yml: +# deploy: +# resources: +# limits: +# cpus: '4' +``` + +### Memory Scaling + +In-memory state scales linearly with concurrent connections: +- ~2-5 KB per active QUIC connection (quinn state) +- ~200 bytes per session entry (DashMap) +- ~100 bytes per rate limit entry +- ~100 bytes per delivery waiter + +```bash +# Estimate memory for 10,000 connections: +# 10,000 * 5 KB = ~50 MB for connections +# 10,000 * 500 bytes = ~5 MB for sessions/rate limits +# SQLCipher connection pool: ~50 MB (4 connections, caches) +# Base process: ~30 MB +# Total: ~135 MB + headroom = 256-512 MB minimum +``` + +### Disk I/O Scaling + +SQLCipher uses WAL mode for concurrent reads. For write-heavy workloads: + +```bash +# Check current I/O +iostat -x 1 5 + +# Move to NVMe if on spinning disk +# Increase WAL autocheckpoint threshold for burst writes +sqlite3 data/qpq.db "PRAGMA key='${QPQ_DB_KEY}'; PRAGMA wal_autocheckpoint=2000;" +``` + +## Horizontal Scaling + +quicproquo does not yet have built-in multi-node clustering. For horizontal scaling, use these patterns: + +### Load Balancer (UDP/QUIC) + +Place a UDP load balancer in front of multiple qpq-server instances. Each instance runs independently with its own database. + +``` + +-----------+ + clients ------> | L4 LB | ----> qpq-server-1 (db-1) + | (UDP/QUIC)| ----> qpq-server-2 (db-2) + +-----------+ qpq-server-3 (db-3) +``` + +**Requirements:** +- Sticky sessions (by client IP or QUIC connection ID) so a client always reaches the same node +- Shared storage backend or federation between nodes + +### Federation for Multi-Node + +Enable federation to relay messages between nodes: + +```toml +# qpq-server.toml on node-1 +[federation] +enabled = true +domain = "node1.chat.example.com" +listen = "0.0.0.0:7001" +federation_cert = "data/federation-cert.der" +federation_key = "data/federation-key.der" +federation_ca = "data/federation-ca.der" + +[[federation.peers]] +domain = "node2.chat.example.com" +address = "10.0.1.2:7001" +``` + +### Shared Database (PostgreSQL) + +For true horizontal scaling, migrate from SQLCipher to a shared PostgreSQL instance. This is not yet implemented but is the planned approach for multi-node deployments. 
+ +``` + qpq-server-1 --\ + qpq-server-2 ---+--> PostgreSQL (shared) + qpq-server-3 --/ +``` + +## Connection Tuning + +The server has these QUIC transport defaults: + +| Parameter | Default | Tunable | +|-----------|---------|---------| +| Max idle timeout | 300s (5 min) | Code change required | +| Max concurrent bidi streams | 1 per connection | Code change required | +| Max concurrent uni streams | 0 | Code change required | +| SQLCipher connection pool | 4 connections | Code change required | + +For high connection counts, consider: +- Increasing the OS file descriptor limit: `ulimit -n 65536` +- Increasing UDP buffer sizes: + +```bash +# /etc/sysctl.d/99-qpq.conf +net.core.rmem_max = 26214400 +net.core.wmem_max = 26214400 +net.core.rmem_default = 1048576 +net.core.wmem_default = 1048576 +``` + +```bash +sysctl -p /etc/sysctl.d/99-qpq.conf +``` + +## Docker Resource Limits + +```yaml +# docker-compose.prod.yml +services: + server: + deploy: + resources: + limits: + cpus: '4' + memory: 4G + reservations: + cpus: '2' + memory: 1G + ulimits: + nofile: + soft: 65536 + hard: 65536 +``` + +## Load Testing + +Use the included test infrastructure to benchmark: + +```bash +# Build the test client +cargo build --release --bin qpq-client + +# Run concurrent connection test (example) +for i in $(seq 1 100); do + qpq-client --server 127.0.0.1:7000 --auth-token "$QPQ_AUTH_TOKEN" & +done +wait + +# Monitor during load test +watch -n1 'curl -s http://localhost:9090/metrics | grep -E "enqueue_total|fetch_total|delivery_queue_depth|rate_limit"' +``` + +## Capacity Planning Worksheet + +| Parameter | Your Value | +|-----------|-----------| +| Expected concurrent users | | +| Messages per user per hour | | +| Average message size (bytes) | | +| Blob uploads per day | | +| Average blob size (MB) | | +| Data retention (days) | | + +**Formulas:** + +``` +Storage per day = (users * msgs/hr * 24 * avg_msg_size) + (blob_uploads * avg_blob_size) +DB growth per month = storage_per_day * 30 +Memory estimate = (concurrent_users * 5 KB) + 256 MB base +CPU estimate = 1 vCPU per ~2,500 concurrent connections (depends on message rate) +```
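+
+As a worked example of the formulas with assumed inputs (1,000 concurrent users, 20 messages/user/hour, 2 KB average message, 50 blob uploads/day at 4 MB each):
+
+```
+Storage per day  = (1,000 * 20 * 24 * 2 KB) + (50 * 4 MB)
+                 = ~960 MB + 200 MB         = ~1.2 GB/day
+DB growth/month  = 1.2 GB * 30              = ~35 GB
+Memory estimate  = (1,000 * 5 KB) + 256 MB  = ~261 MB
+CPU estimate     = 1,000 / 2,500            = ~0.4 vCPU (round up to 1)
+```
+
+Numbers like these land in the "Recommended" sizing tier above; a shorter retention window reduces the disk requirement proportionally.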