docs: add operational runbook, Grafana dashboard, and production docker-compose
Add comprehensive operational documentation: - docs/operations/backup-restore.md: SQLCipher, file backend, blob backup/restore - docs/operations/key-rotation.md: auth token, TLS, federation, DB key, OPAQUE rotation - docs/operations/incident-response.md: playbook for common incidents - docs/operations/scaling-guide.md: resource sizing, scaling triggers, capacity planning - docs/operations/monitoring.md: Prometheus metrics, alert rules, log monitoring - docs/operations/dashboards/qpq-overview.json: Grafana dashboard template - docs/operations/prometheus.yml + alerts: Prometheus scrape and alert config - docs/operations/grafana-provisioning/: auto-provisioning for datasources and dashboards - docker-compose.prod.yml: production stack (server + Prometheus + Grafana) - .env.example: documented environment variable template
This commit is contained in:
20
.env.example
Normal file
20
.env.example
Normal file
@@ -0,0 +1,20 @@
|
||||
# quicproquo Production Environment Variables
|
||||
# Copy this file to .env and fill in the values.
|
||||
|
||||
# Server auth token (required, >= 16 characters)
|
||||
QPQ_AUTH_TOKEN=
|
||||
|
||||
# SQLCipher database encryption key (required for store_backend=sql)
|
||||
QPQ_DB_KEY=
|
||||
|
||||
# Ports (defaults shown)
|
||||
QPQ_LISTEN_PORT=7000
|
||||
QPQ_WS_PORT=9000
|
||||
|
||||
# Optional features
|
||||
QPQ_SEALED_SENDER=false
|
||||
QPQ_REDACT_LOGS=true
|
||||
QPQ_WS_LISTEN=
|
||||
|
||||
# Grafana admin password
|
||||
GRAFANA_ADMIN_PASSWORD=changeme
|
||||
113
docker-compose.prod.yml
Normal file
113
docker-compose.prod.yml
Normal file
@@ -0,0 +1,113 @@
|
||||
# Production Docker Compose for quicproquo
|
||||
#
|
||||
# Usage:
|
||||
# 1. Copy .env.example to .env and fill in secrets
|
||||
# 2. Place TLS certificates in ./certs/
|
||||
# 3. docker compose -f docker-compose.prod.yml up -d
|
||||
#
|
||||
# Prerequisites:
|
||||
# - TLS certificate and key in DER format (no auto-generation in production)
|
||||
# - Strong auth token (>= 16 characters)
|
||||
# - Database encryption key
|
||||
|
||||
networks:
|
||||
qpq:
|
||||
driver: bridge
|
||||
|
||||
volumes:
|
||||
qpq-data:
|
||||
prometheus-data:
|
||||
grafana-data:
|
||||
|
||||
services:
|
||||
# ── quicproquo server ────────────────────────────────────────────────────────
|
||||
server:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: docker/Dockerfile
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "${QPQ_LISTEN_PORT:-7000}:7000/udp" # QUIC
|
||||
- "${QPQ_WS_PORT:-9000}:9000" # WebSocket bridge (optional)
|
||||
environment:
|
||||
RUST_LOG: info
|
||||
QPQ_PRODUCTION: "true"
|
||||
QPQ_LISTEN: "0.0.0.0:7000"
|
||||
QPQ_DATA_DIR: /var/lib/quicproquo
|
||||
QPQ_TLS_CERT: /var/lib/quicproquo/certs/server-cert.der
|
||||
QPQ_TLS_KEY: /var/lib/quicproquo/certs/server-key.der
|
||||
QPQ_AUTH_TOKEN: "${QPQ_AUTH_TOKEN}"
|
||||
QPQ_STORE_BACKEND: sql
|
||||
QPQ_DB_PATH: /var/lib/quicproquo/qpq.db
|
||||
QPQ_DB_KEY: "${QPQ_DB_KEY}"
|
||||
QPQ_METRICS_LISTEN: "0.0.0.0:9090"
|
||||
QPQ_METRICS_ENABLED: "true"
|
||||
QPQ_SEALED_SENDER: "${QPQ_SEALED_SENDER:-false}"
|
||||
QPQ_REDACT_LOGS: "${QPQ_REDACT_LOGS:-true}"
|
||||
QPQ_WS_LISTEN: "${QPQ_WS_LISTEN:-}"
|
||||
volumes:
|
||||
- qpq-data:/var/lib/quicproquo
|
||||
- ./certs:/var/lib/quicproquo/certs:ro
|
||||
networks:
|
||||
- qpq
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: '4'
|
||||
memory: 4G
|
||||
reservations:
|
||||
cpus: '2'
|
||||
memory: 1G
|
||||
ulimits:
|
||||
nofile:
|
||||
soft: 65536
|
||||
hard: 65536
|
||||
healthcheck:
|
||||
test: ["CMD", "test", "-f", "/var/lib/quicproquo/certs/server-cert.der"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
start_period: 10s
|
||||
logging:
|
||||
driver: json-file
|
||||
options:
|
||||
max-size: "50m"
|
||||
max-file: "5"
|
||||
|
||||
# ── Prometheus ───────────────────────────────────────────────────────────────
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "127.0.0.1:9091:9090"
|
||||
volumes:
|
||||
- prometheus-data:/prometheus
|
||||
- ./docs/operations/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||
- ./docs/operations/prometheus-alerts.yml:/etc/prometheus/alerts.yml:ro
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.path=/prometheus'
|
||||
- '--storage.tsdb.retention.time=30d'
|
||||
- '--web.enable-lifecycle'
|
||||
networks:
|
||||
- qpq
|
||||
depends_on:
|
||||
- server
|
||||
|
||||
# ── Grafana ──────────────────────────────────────────────────────────────────
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "127.0.0.1:3000:3000"
|
||||
environment:
|
||||
GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:-changeme}"
|
||||
GF_USERS_ALLOW_SIGN_UP: "false"
|
||||
volumes:
|
||||
- grafana-data:/var/lib/grafana
|
||||
- ./docs/operations/dashboards:/var/lib/grafana/dashboards:ro
|
||||
- ./docs/operations/grafana-provisioning:/etc/grafana/provisioning:ro
|
||||
networks:
|
||||
- qpq
|
||||
depends_on:
|
||||
- prometheus
|
||||
199
docs/operations/backup-restore.md
Normal file
199
docs/operations/backup-restore.md
Normal file
@@ -0,0 +1,199 @@
|
||||
# Backup and Restore Procedures
|
||||
|
||||
This document covers backup and restore for all quicproquo server data stores.
|
||||
|
||||
## Data Inventory
|
||||
|
||||
| Data | Location | Backend | Contains |
|
||||
|------|----------|---------|----------|
|
||||
| SQLCipher DB | `QPQ_DB_PATH` (default `data/qpq.db`) | `store_backend=sql` | Users, key packages, delivery queues, sessions, KT log, OPAQUE setup, blobs metadata, moderation |
|
||||
| File store | `QPQ_DATA_DIR` (default `data/`) | `store_backend=file` | Bincode-serialized key packages, delivery queues, server state |
|
||||
| Blob storage | `QPQ_DATA_DIR/blobs/` | Filesystem | Uploaded file transfer blobs |
|
||||
| TLS certificates | `QPQ_TLS_CERT`, `QPQ_TLS_KEY` | DER files | Server identity |
|
||||
| OPAQUE ServerSetup | Inside DB or file store | Persisted | OPAQUE credential state (critical for auth) |
|
||||
| Server signing key | Inside DB or file store | Persisted | Ed25519 key for delivery proofs |
|
||||
| KT Merkle log | Inside DB or file store | Persisted | Key transparency audit log |
|
||||
|
||||
## SQLCipher Backup
|
||||
|
||||
### Hot Backup (Online)
|
||||
|
||||
SQLCipher supports the `.backup` command while the server is running (WAL mode allows concurrent readers).
|
||||
|
||||
```bash
|
||||
# 1. Open the encrypted database with the same key
|
||||
sqlite3 data/qpq.db
|
||||
|
||||
# 2. At the sqlite3 prompt, set the encryption key
|
||||
PRAGMA key = 'your-db-key-here';
|
||||
|
||||
# 3. Perform an online backup
|
||||
.backup /backups/qpq-$(date +%Y%m%d-%H%M%S).db
|
||||
|
||||
.quit
|
||||
```
|
||||
|
||||
### Scripted Hot Backup
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
BACKUP_DIR="/backups/qpq"
|
||||
DB_PATH="${QPQ_DB_PATH:-data/qpq.db}"
|
||||
DB_KEY="${QPQ_DB_KEY}"
|
||||
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
|
||||
BACKUP_FILE="${BACKUP_DIR}/qpq-${TIMESTAMP}.db"
|
||||
|
||||
mkdir -p "$BACKUP_DIR"
|
||||
|
||||
sqlite3 "$DB_PATH" <<EOF
|
||||
PRAGMA key = '${DB_KEY}';
|
||||
.backup ${BACKUP_FILE}
|
||||
EOF
|
||||
|
||||
# Verify the backup is readable
|
||||
sqlite3 "$BACKUP_FILE" "PRAGMA key = '${DB_KEY}'; PRAGMA integrity_check;" \
|
||||
| grep -q "ok" && echo "Backup verified: $BACKUP_FILE" \
|
||||
|| { echo "ERROR: backup verification failed"; exit 1; }
|
||||
|
||||
# Retain last 7 daily backups
|
||||
find "$BACKUP_DIR" -name 'qpq-*.db' -mtime +7 -delete
|
||||
```
|
||||
|
||||
### Cold Backup (Offline)
|
||||
|
||||
```bash
|
||||
# 1. Stop the server
|
||||
systemctl stop qpq-server # or docker compose stop server
|
||||
|
||||
# 2. Copy the database file
|
||||
cp data/qpq.db /backups/qpq-$(date +%Y%m%d).db
|
||||
|
||||
# 3. Copy the WAL and SHM files if they exist
|
||||
cp data/qpq.db-wal /backups/ 2>/dev/null || true
|
||||
cp data/qpq.db-shm /backups/ 2>/dev/null || true
|
||||
|
||||
# 4. Restart the server
|
||||
systemctl start qpq-server
|
||||
```
|
||||
|
||||
## File Backend Backup
|
||||
|
||||
When using `store_backend=file`, data is stored as bincode files under `QPQ_DATA_DIR`.
|
||||
|
||||
```bash
|
||||
# Full directory backup
|
||||
tar czf /backups/qpq-data-$(date +%Y%m%d-%H%M%S).tar.gz \
|
||||
-C "$(dirname "${QPQ_DATA_DIR:-data}")" \
|
||||
"$(basename "${QPQ_DATA_DIR:-data}")"
|
||||
```
|
||||
|
||||
## Blob Storage Backup
|
||||
|
||||
Blobs are stored in `QPQ_DATA_DIR/blobs/`. These are immutable once written.
|
||||
|
||||
```bash
|
||||
# Incremental rsync (blobs are write-once, ideal for rsync)
|
||||
rsync -av --progress data/blobs/ /backups/blobs/
|
||||
```
|
||||
|
||||
## TLS Certificate Backup
|
||||
|
||||
```bash
|
||||
# Back up TLS certificates (store separately from DB backups)
|
||||
cp data/server-cert.der /backups/tls/server-cert.der
|
||||
cp data/server-key.der /backups/tls/server-key.der
|
||||
|
||||
# Federation certs (if federation is enabled)
|
||||
cp data/federation-cert.der /backups/tls/federation-cert.der 2>/dev/null || true
|
||||
cp data/federation-key.der /backups/tls/federation-key.der 2>/dev/null || true
|
||||
cp data/federation-ca.der /backups/tls/federation-ca.der 2>/dev/null || true
|
||||
```
|
||||
|
||||
## Restore Procedures
|
||||
|
||||
### Restore SQLCipher Database
|
||||
|
||||
```bash
|
||||
# 1. Stop the server
|
||||
systemctl stop qpq-server
|
||||
|
||||
# 2. Move the current (corrupt/lost) database aside
|
||||
mv data/qpq.db data/qpq.db.broken 2>/dev/null || true
|
||||
rm -f data/qpq.db-wal data/qpq.db-shm
|
||||
|
||||
# 3. Copy the backup in place
|
||||
cp /backups/qpq-20260304.db data/qpq.db
|
||||
|
||||
# 4. Verify integrity
|
||||
sqlite3 data/qpq.db "PRAGMA key = '${QPQ_DB_KEY}'; PRAGMA integrity_check;"
|
||||
|
||||
# 5. Start the server (migrations will apply automatically if needed)
|
||||
systemctl start qpq-server
|
||||
```
|
||||
|
||||
### Restore File Backend
|
||||
|
||||
```bash
|
||||
# 1. Stop the server
|
||||
systemctl stop qpq-server
|
||||
|
||||
# 2. Replace the data directory
|
||||
mv data data.broken 2>/dev/null || true
|
||||
tar xzf /backups/qpq-data-20260304.tar.gz -C .
|
||||
|
||||
# 3. Restore TLS certs if not included in the data backup
|
||||
cp /backups/tls/server-cert.der data/server-cert.der
|
||||
cp /backups/tls/server-key.der data/server-key.der
|
||||
|
||||
# 4. Start the server
|
||||
systemctl start qpq-server
|
||||
```
|
||||
|
||||
### Restore Blobs Only
|
||||
|
||||
```bash
|
||||
rsync -av /backups/blobs/ data/blobs/
|
||||
```
|
||||
|
||||
## Backup Schedule Recommendations
|
||||
|
||||
| Frequency | What | Method |
|
||||
|-----------|------|--------|
|
||||
| Every 6 hours | SQLCipher database | Hot backup script via cron |
|
||||
| Daily | File backend / full data dir | tar + offsite copy |
|
||||
| Continuous | Blobs | rsync (incremental) |
|
||||
| On change | TLS certificates | Manual + secret manager |
|
||||
|
||||
## Cron Example
|
||||
|
||||
```cron
|
||||
# SQLCipher hot backup every 6 hours
|
||||
0 */6 * * * /opt/qpq/scripts/backup-db.sh >> /var/log/qpq-backup.log 2>&1
|
||||
|
||||
# Full data directory daily at 02:00
|
||||
0 2 * * * tar czf /backups/qpq-data-$(date +\%Y\%m\%d).tar.gz -C /var/lib quicproquo
|
||||
|
||||
# Blob sync every hour
|
||||
0 * * * * rsync -a /var/lib/quicproquo/blobs/ /backups/blobs/
|
||||
|
||||
# Prune backups older than 30 days
|
||||
0 3 * * 0 find /backups -name 'qpq-*' -mtime +30 -delete
|
||||
```
|
||||
|
||||
## Verification
|
||||
|
||||
Always verify backups after creation:
|
||||
|
||||
```bash
|
||||
# SQLCipher integrity check
|
||||
sqlite3 /backups/qpq-latest.db \
|
||||
"PRAGMA key = '${QPQ_DB_KEY}'; PRAGMA integrity_check; SELECT count(*) FROM users;"
|
||||
|
||||
# File backend: check the archive is valid
|
||||
tar tzf /backups/qpq-data-latest.tar.gz > /dev/null
|
||||
|
||||
# TLS cert: check it parses and is not expired
|
||||
openssl x509 -inform DER -in /backups/tls/server-cert.der -noout -dates
|
||||
```
|
||||
395
docs/operations/dashboards/qpq-overview.json
Normal file
395
docs/operations/dashboards/qpq-overview.json
Normal file
@@ -0,0 +1,395 @@
|
||||
{
|
||||
"__inputs": [
|
||||
{
|
||||
"name": "DS_PROMETHEUS",
|
||||
"label": "Prometheus",
|
||||
"description": "",
|
||||
"type": "datasource",
|
||||
"pluginId": "prometheus",
|
||||
"pluginName": "Prometheus"
|
||||
}
|
||||
],
|
||||
"__requires": [
|
||||
{
|
||||
"type": "grafana",
|
||||
"id": "grafana",
|
||||
"name": "Grafana",
|
||||
"version": "10.0.0"
|
||||
},
|
||||
{
|
||||
"type": "datasource",
|
||||
"id": "prometheus",
|
||||
"name": "Prometheus",
|
||||
"version": "1.0.0"
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "timeseries",
|
||||
"name": "Time series",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "stat",
|
||||
"name": "Stat",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "gauge",
|
||||
"name": "Gauge",
|
||||
"version": ""
|
||||
}
|
||||
],
|
||||
"id": null,
|
||||
"uid": "qpq-overview",
|
||||
"title": "quicproquo Server Overview",
|
||||
"description": "Operational dashboard for quicproquo server instances",
|
||||
"tags": ["quicproquo", "qpq"],
|
||||
"timezone": "browser",
|
||||
"editable": true,
|
||||
"graphTooltip": 1,
|
||||
"refresh": "10s",
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"title": "Server Status",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 0 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "up{job=\"qpq-server\"}",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" }, "1": { "text": "UP", "color": "green" } } }
|
||||
],
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{ "value": null, "color": "red" },
|
||||
{ "value": 1, "color": "green" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Enqueue Rate",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 0 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(enqueue_total[5m])",
|
||||
"legendFormat": "msgs/sec"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{ "value": null, "color": "green" },
|
||||
{ "value": 100, "color": "yellow" },
|
||||
{ "value": 500, "color": "red" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Fetch Rate",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 0 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(fetch_total[5m])",
|
||||
"legendFormat": "fetches/sec"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{ "value": null, "color": "green" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Auth Success Rate",
|
||||
"type": "gauge",
|
||||
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 0 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(auth_login_success_total[5m]) / (rate(auth_login_success_total[5m]) + rate(auth_login_failure_total[5m]))",
|
||||
"legendFormat": "success ratio"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percentunit",
|
||||
"min": 0,
|
||||
"max": 1,
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{ "value": null, "color": "red" },
|
||||
{ "value": 0.5, "color": "yellow" },
|
||||
{ "value": 0.9, "color": "green" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Delivery Queue Depth",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 0 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "delivery_queue_depth",
|
||||
"legendFormat": "depth"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{ "value": null, "color": "green" },
|
||||
{ "value": 10000, "color": "yellow" },
|
||||
{ "value": 100000, "color": "red" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Rate Limit Hits",
|
||||
"type": "stat",
|
||||
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 0 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(rate_limit_hit_total[5m])",
|
||||
"legendFormat": "hits/sec"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{ "value": null, "color": "green" },
|
||||
{ "value": 1, "color": "yellow" },
|
||||
{ "value": 10, "color": "red" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Message Throughput",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(enqueue_total[5m])",
|
||||
"legendFormat": "enqueue rate"
|
||||
},
|
||||
{
|
||||
"expr": "rate(fetch_total[5m])",
|
||||
"legendFormat": "fetch rate"
|
||||
},
|
||||
{
|
||||
"expr": "rate(fetch_wait_total[5m])",
|
||||
"legendFormat": "fetch_wait rate"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 10
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Enqueue Bandwidth",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(enqueue_bytes_total[5m])",
|
||||
"legendFormat": "bytes/sec"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "scheme"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Authentication",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(auth_login_success_total[5m])",
|
||||
"legendFormat": "success/sec"
|
||||
},
|
||||
{
|
||||
"expr": "rate(auth_login_failure_total[5m])",
|
||||
"legendFormat": "failure/sec"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 10
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "failure/sec" },
|
||||
"properties": [
|
||||
{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Delivery Queue Depth Over Time",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "delivery_queue_depth",
|
||||
"legendFormat": "queue depth"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 20,
|
||||
"gradientMode": "scheme",
|
||||
"thresholdsStyle": { "mode": "area" }
|
||||
},
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{ "value": null, "color": "green" },
|
||||
{ "value": 10000, "color": "yellow" },
|
||||
{ "value": 100000, "color": "red" }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Rate Limiting & Key Packages",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(rate_limit_hit_total[5m])",
|
||||
"legendFormat": "rate limit hits/sec"
|
||||
},
|
||||
{
|
||||
"expr": "rate(key_package_upload_total[5m])",
|
||||
"legendFormat": "key package uploads/sec"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ops",
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 10
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "rate limit hits/sec" },
|
||||
"properties": [
|
||||
{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Cumulative Totals",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "enqueue_total",
|
||||
"legendFormat": "total enqueued"
|
||||
},
|
||||
{
|
||||
"expr": "fetch_total",
|
||||
"legendFormat": "total fetched"
|
||||
},
|
||||
{
|
||||
"expr": "auth_login_success_total",
|
||||
"legendFormat": "total logins"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 5
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "datasource",
|
||||
"type": "datasource",
|
||||
"query": "prometheus",
|
||||
"current": {},
|
||||
"hide": 0
|
||||
}
|
||||
]
|
||||
},
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"name": "Deploys",
|
||||
"datasource": "-- Grafana --",
|
||||
"enable": true,
|
||||
"iconColor": "blue",
|
||||
"tags": ["deploy"]
|
||||
}
|
||||
]
|
||||
},
|
||||
"schemaVersion": 38,
|
||||
"version": 1
|
||||
}
|
||||
12
docs/operations/grafana-provisioning/dashboards/default.yml
Normal file
12
docs/operations/grafana-provisioning/dashboards/default.yml
Normal file
@@ -0,0 +1,12 @@
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'default'
|
||||
orgId: 1
|
||||
folder: 'quicproquo'
|
||||
type: file
|
||||
disableDeletion: false
|
||||
editable: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
foldersFromFilesStructure: false
|
||||
@@ -0,0 +1,9 @@
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
editable: false
|
||||
338
docs/operations/incident-response.md
Normal file
338
docs/operations/incident-response.md
Normal file
@@ -0,0 +1,338 @@
|
||||
# Incident Response Playbook
|
||||
|
||||
This document provides procedures for responding to common operational incidents in a quicproquo deployment.
|
||||
|
||||
## Severity Levels
|
||||
|
||||
| Level | Description | Response Time | Examples |
|
||||
|-------|-------------|---------------|----------|
|
||||
| P1 - Critical | Service down, data loss, key compromise | Immediate | Server crash loop, DB corruption, leaked secrets |
|
||||
| P2 - Major | Degraded service, partial outage | 15 minutes | High latency, storage full, cert expiry |
|
||||
| P3 - Minor | Non-critical issue, monitoring alert | 1 hour | Rate limit spikes, non-critical warnings |
|
||||
|
||||
## Incident: Server Not Starting
|
||||
|
||||
### Symptoms
|
||||
- Server process exits immediately
|
||||
- Logs show "TLS cert or key missing" or "production forbids" errors
|
||||
|
||||
### Diagnosis
|
||||
|
||||
```bash
|
||||
# Check server logs
|
||||
journalctl -u qpq-server --since "10 min ago" --no-pager
|
||||
|
||||
# Docker
|
||||
docker compose logs --tail=50 server
|
||||
```
|
||||
|
||||
### Common Causes and Fixes
|
||||
|
||||
**Missing TLS certificates (production mode)**
|
||||
```bash
|
||||
# Production requires pre-existing certs (no auto-generation)
|
||||
ls -la data/server-cert.der data/server-key.der
|
||||
|
||||
# If missing, restore from backup or generate new ones
|
||||
# See: key-rotation.md
|
||||
```
|
||||
|
||||
**Missing auth token (production mode)**
|
||||
```bash
|
||||
# Production requires QPQ_AUTH_TOKEN >= 16 chars, not "devtoken"
|
||||
echo -n "$QPQ_AUTH_TOKEN" | wc -c   # -n: don't count the trailing newline
|
||||
```
|
||||
|
||||
**Database locked or corrupt**
|
||||
```bash
|
||||
# Check if another process holds the database
|
||||
fuser data/qpq.db
|
||||
|
||||
# Verify database integrity
|
||||
sqlite3 data/qpq.db "PRAGMA key='${QPQ_DB_KEY}'; PRAGMA integrity_check;"
|
||||
```
|
||||
|
||||
**Port already in use**
|
||||
```bash
|
||||
# Check if something is already listening on port 7000
|
||||
ss -tlnp | grep 7000
|
||||
```
|
||||
|
||||
## Incident: Node Down / Unresponsive
|
||||
|
||||
### Symptoms
|
||||
- Clients cannot connect
|
||||
- Health check failures
|
||||
- No new log entries
|
||||
|
||||
### Diagnosis
|
||||
|
||||
```bash
|
||||
# 1. Check if the process is running
|
||||
systemctl status qpq-server
|
||||
# or: docker compose ps
|
||||
|
||||
# 2. Check resource usage
|
||||
top -bn1 | grep qpq
|
||||
df -h /var/lib/quicproquo
|
||||
free -h
|
||||
|
||||
# 3. Check QUIC port is reachable
|
||||
# From another host:
|
||||
nc -uzv <server-ip> 7000
|
||||
|
||||
# 4. Check for OOM kills
|
||||
dmesg | grep -i "out of memory\|oom" | tail -5
|
||||
journalctl -k | grep -i oom
|
||||
```
|
||||
|
||||
### Recovery
|
||||
|
||||
```bash
|
||||
# Restart the service
|
||||
systemctl restart qpq-server
|
||||
|
||||
# If OOM: increase memory limit
|
||||
systemctl edit qpq-server --force
|
||||
# MemoryMax=2G
|
||||
|
||||
# If disk full: see "Storage Full" incident below
|
||||
```
|
||||
|
||||
## Incident: Storage Full
|
||||
|
||||
### Symptoms
|
||||
- `enqueue` operations fail
|
||||
- Logs show "No space left on device"
|
||||
- `delivery_queue_depth` gauge rising
|
||||
|
||||
### Diagnosis
|
||||
|
||||
```bash
|
||||
# Check disk usage
|
||||
df -h /var/lib/quicproquo
|
||||
du -sh /var/lib/quicproquo/*
|
||||
|
||||
# Check largest files
|
||||
du -a /var/lib/quicproquo | sort -rn | head -20
|
||||
|
||||
# Check blob storage specifically
|
||||
du -sh /var/lib/quicproquo/blobs/
|
||||
find /var/lib/quicproquo/blobs/ -type f | wc -l
|
||||
```
|
||||
|
||||
### Recovery
|
||||
|
||||
```bash
|
||||
# 1. Identify and remove expired messages (the cleanup task handles this,
|
||||
# but if it's behind, you can trigger manual cleanup)
|
||||
|
||||
# For SQL backend: delete expired delivery messages
|
||||
sqlite3 data/qpq.db <<EOF
|
||||
PRAGMA key = '${QPQ_DB_KEY}';
|
||||
DELETE FROM delivery_queue WHERE expires_at IS NOT NULL AND expires_at < unixepoch();
|
||||
VACUUM;
|
||||
EOF
|
||||
|
||||
# 2. Remove orphaned blobs (blobs not referenced by any message)
|
||||
# This is application-specific; coordinate with the codebase
|
||||
|
||||
# 3. If the data partition is full, expand the volume
|
||||
# AWS EBS: aws ec2 modify-volume --volume-id vol-xxx --size 100
|
||||
# Then: resize2fs /dev/xvdf
|
||||
|
||||
# 4. Move to a larger disk
|
||||
systemctl stop qpq-server
|
||||
rsync -av /var/lib/quicproquo/ /mnt/new-volume/quicproquo/
|
||||
# Update QPQ_DATA_DIR and QPQ_DB_PATH to point to the new location
|
||||
systemctl start qpq-server
|
||||
```
|
||||
|
||||
### Prevention
|
||||
|
||||
- Set up disk usage alerts at 70% and 90% thresholds.
|
||||
- Configure message TTL (`ttl_secs`) to auto-expire old messages.
|
||||
- Schedule regular `VACUUM` on the SQLCipher database.
|
||||
|
||||
## Incident: DDoS / Connection Flood
|
||||
|
||||
### Symptoms
|
||||
- `rate_limit_hit_total` counter spiking
|
||||
- `auth_login_failure_total` counter spiking
|
||||
- High CPU usage
|
||||
- Legitimate clients cannot connect
|
||||
|
||||
### Diagnosis
|
||||
|
||||
```bash
|
||||
# Check connection rate limit hits in metrics
|
||||
curl -s http://localhost:9090/metrics | grep rate_limit
|
||||
|
||||
# Check auth failure rate
|
||||
curl -s http://localhost:9090/metrics | grep auth_login_failure
|
||||
|
||||
# Check active connections (QUIC uses UDP)
|
||||
ss -unp | grep 7000 | wc -l
|
||||
```
|
||||
|
||||
### Mitigation
|
||||
|
||||
```bash
|
||||
# 1. The server has built-in per-IP connection rate limiting.
|
||||
# Check the logs for "connection rate limit exceeded" messages.
|
||||
|
||||
# 2. Block offending IPs at the firewall level
|
||||
iptables -A INPUT -s <attacker-ip> -j DROP
|
||||
|
||||
# 3. For volumetric attacks, use upstream DDoS protection
|
||||
# (Cloudflare Spectrum, AWS Shield, etc.)
|
||||
|
||||
# 4. If the server is overwhelmed, restart to clear state
|
||||
systemctl restart qpq-server
|
||||
|
||||
# 5. Enable log redaction to reduce I/O pressure during attacks
|
||||
# Set QPQ_REDACT_LOGS=true
|
||||
```
|
||||
|
||||
## Incident: Key Compromise
|
||||
|
||||
### Auth Token Compromised
|
||||
|
||||
**Severity: P1**
|
||||
|
||||
```bash
|
||||
# 1. Immediately rotate the auth token
|
||||
NEW_TOKEN=$(openssl rand -base64 32)
|
||||
|
||||
# 2. Update server config and restart
|
||||
# See: key-rotation.md "Auth Token Rotation"
|
||||
|
||||
# 3. Notify all legitimate clients of the new token
|
||||
|
||||
# 4. Review logs for unauthorized access
|
||||
journalctl -u qpq-server | grep "auth_login_success" | tail -100
|
||||
```
|
||||
|
||||
### TLS Private Key Compromised
|
||||
|
||||
**Severity: P1**
|
||||
|
||||
```bash
|
||||
# 1. Generate and install a new certificate immediately
|
||||
# See: key-rotation.md "TLS Certificate Rotation"
|
||||
|
||||
# 2. Revoke the compromised certificate with your CA
|
||||
# (procedure depends on your CA)
|
||||
|
||||
# 3. Restart the server with the new certificate
|
||||
systemctl restart qpq-server
|
||||
|
||||
# 4. If clients pin certificates, notify them of the change
|
||||
```
|
||||
|
||||
### Database Key Compromised
|
||||
|
||||
**Severity: P1**
|
||||
|
||||
```bash
|
||||
# 1. Stop the server
|
||||
systemctl stop qpq-server
|
||||
|
||||
# 2. Rekey the database immediately
|
||||
# See: key-rotation.md "Database Encryption Key Rotation"
|
||||
|
||||
# 3. Assess data exposure
|
||||
# If the attacker had access to the database file, assume all
|
||||
# stored data (users, key packages, delivery queues) is compromised.
|
||||
|
||||
# 4. Consider notifying affected users
|
||||
```
|
||||
|
||||
### OPAQUE ServerSetup Compromised
|
||||
|
||||
**Severity: P1**
|
||||
|
||||
```bash
|
||||
# 1. Rotate the OPAQUE ServerSetup
|
||||
# See: key-rotation.md "OPAQUE ServerSetup Rotation"
|
||||
|
||||
# WARNING: This invalidates ALL OPAQUE credentials.
|
||||
# All users must re-register.
|
||||
|
||||
# 2. All users must re-register with new credentials
|
||||
# 3. Review logs for unauthorized OPAQUE authentications
|
||||
```
|
||||
|
||||
## Incident: High Latency
|
||||
|
||||
### Symptoms
|
||||
- Clients report slow message delivery
|
||||
- `delivery_queue_depth` gauge is high
|
||||
- Fetch operations are slow
|
||||
|
||||
### Diagnosis
|
||||
|
||||
```bash
|
||||
# 1. Check system resources
|
||||
top -bn1 | head -20
|
||||
iostat -x 1 3
|
||||
|
||||
# 2. Check database performance
|
||||
sqlite3 data/qpq.db <<EOF
|
||||
PRAGMA key = '${QPQ_DB_KEY}';
|
||||
PRAGMA integrity_check;
|
||||
PRAGMA wal_checkpoint(PASSIVE);
|
||||
-- Check table sizes
|
||||
SELECT 'delivery_queue', count(*) FROM delivery_queue
|
||||
UNION ALL SELECT 'key_packages', count(*) FROM key_packages
|
||||
UNION ALL SELECT 'users', count(*) FROM users;
|
||||
EOF
|
||||
|
||||
# 3. Check queue depth via metrics
|
||||
curl -s http://localhost:9090/metrics | grep delivery_queue_depth
|
||||
```
|
||||
|
||||
### Recovery
|
||||
|
||||
```bash
|
||||
# 1. Checkpoint the WAL (reduces WAL file size)
|
||||
sqlite3 data/qpq.db "PRAGMA key='${QPQ_DB_KEY}'; PRAGMA wal_checkpoint(TRUNCATE);"
|
||||
|
||||
# 2. VACUUM to reclaim space and defragment
|
||||
sqlite3 data/qpq.db "PRAGMA key='${QPQ_DB_KEY}'; VACUUM;"
|
||||
|
||||
# 3. If the queue is huge, check for clients not fetching
|
||||
# (delivery_queue rows accumulate when clients are offline)
|
||||
|
||||
# 4. If I/O-bound: move database to faster storage (SSD/NVMe)
|
||||
```
|
||||
|
||||
## Incident: Certificate Expiring
|
||||
|
||||
### Symptoms
|
||||
- Log warning: "TLS certificate expires within 30 days"
|
||||
- Monitoring alert on certificate expiry
|
||||
|
||||
### Response
|
||||
|
||||
```bash
|
||||
# 1. Check current certificate expiry
|
||||
openssl x509 -inform DER -in data/server-cert.der -noout -enddate
|
||||
|
||||
# 2. Renew the certificate
|
||||
# See: key-rotation.md "TLS Certificate Rotation"
|
||||
|
||||
# 3. Verify the new certificate is loaded
|
||||
journalctl -u qpq-server --since "1 min ago" | grep -i cert
|
||||
```
|
||||
|
||||
## Post-Incident Checklist
|
||||
|
||||
After resolving any incident:
|
||||
|
||||
1. **Document** the incident: timeline, root cause, resolution steps
|
||||
2. **Verify** the service is fully operational (check metrics, test client connections)
|
||||
3. **Review** whether monitoring would have caught this earlier
|
||||
4. **Update** alerts and thresholds based on findings
|
||||
5. **Communicate** with affected users if there was data exposure or service disruption
|
||||
6. **Schedule** follow-up actions (e.g., add monitoring, improve automation)
|
||||
250
docs/operations/key-rotation.md
Normal file
250
docs/operations/key-rotation.md
Normal file
@@ -0,0 +1,250 @@
|
||||
# Key Rotation Procedures
|
||||
|
||||
This document provides step-by-step procedures for rotating all cryptographic material in a quicproquo deployment.
|
||||
|
||||
## Auth Token Rotation
|
||||
|
||||
The auth token (`QPQ_AUTH_TOKEN`) is used for bearer-token authentication (auth version 1). OPAQUE-authenticated sessions are not affected by token rotation.
|
||||
|
||||
### Procedure
|
||||
|
||||
```bash
|
||||
# 1. Generate a new token (minimum 16 characters for production)
|
||||
NEW_TOKEN=$(openssl rand -base64 32)
|
||||
echo "New token: $NEW_TOKEN"
|
||||
|
||||
# 2. Update the config file or environment
|
||||
# Option A: TOML config file
|
||||
sed -i "s|^auth_token = .*|auth_token = \"$NEW_TOKEN\"|" qpq-server.toml
|
||||
|
||||
# Option B: Environment variable (systemd)
|
||||
systemctl edit qpq-server --force
|
||||
# Add: Environment=QPQ_AUTH_TOKEN=<new-token>
|
||||
|
||||
# Option C: Docker Compose
|
||||
# Update QPQ_AUTH_TOKEN in docker-compose.prod.yml or .env file
|
||||
|
||||
# 3. Restart the server
|
||||
systemctl restart qpq-server
|
||||
# or: docker compose restart server
|
||||
|
||||
# 4. Update all clients with the new token
|
||||
# Clients using OPAQUE auth are unaffected.
|
||||
# Clients using bearer-token auth must update their QPQ_ACCESS_TOKEN.
|
||||
```
|
||||
|
||||
### Impact
|
||||
|
||||
- Active bearer-token sessions continue until they expire (sessions are in-memory).
|
||||
- New bearer-token connections must use the new token.
|
||||
- OPAQUE-authenticated clients are not affected.
|
||||
|
||||
## TLS Certificate Rotation
|
||||
|
||||
The server uses DER-encoded X.509 certificates for QUIC TLS 1.3. The server validates certificates at startup and warns if expiry is within 30 days.
|
||||
|
||||
### Procedure
|
||||
|
||||
```bash
|
||||
# 1. Obtain a new certificate (example with Let's Encrypt / certbot)
|
||||
certbot certonly --standalone -d chat.example.com
|
||||
|
||||
# 2. Convert PEM to DER format (qpq-server expects DER)
|
||||
openssl x509 -in /etc/letsencrypt/live/chat.example.com/fullchain.pem \
|
||||
-outform DER -out /tmp/server-cert.der
|
||||
|
||||
openssl pkey -in /etc/letsencrypt/live/chat.example.com/privkey.pem \
|
||||
-outform DER -out /tmp/server-key.der
|
||||
|
||||
# 3. Set restrictive permissions on the private key
|
||||
chmod 600 /tmp/server-key.der
|
||||
|
||||
# 4. Back up the current certificates
|
||||
cp data/server-cert.der data/server-cert.der.bak
|
||||
cp data/server-key.der data/server-key.der.bak
|
||||
|
||||
# 5. Replace certificates
|
||||
cp /tmp/server-cert.der data/server-cert.der
|
||||
cp /tmp/server-key.der data/server-key.der
|
||||
|
||||
# 6. Verify the new certificate
|
||||
openssl x509 -inform DER -in data/server-cert.der -noout -text | head -20
|
||||
|
||||
# 7. Restart the server (QUIC requires restart for new TLS config)
|
||||
systemctl restart qpq-server
|
||||
|
||||
# 8. Verify the server started with the new certificate
|
||||
journalctl -u qpq-server --since "1 min ago" | grep -i tls
|
||||
```
|
||||
|
||||
### Self-Signed Certificate (Development)
|
||||
|
||||
In non-production mode, the server auto-generates a self-signed certificate if none exists. To force regeneration:
|
||||
|
||||
```bash
|
||||
rm data/server-cert.der data/server-key.der
|
||||
systemctl restart qpq-server
|
||||
# Server will generate a new self-signed cert for localhost/127.0.0.1/::1
|
||||
```
|
||||
|
||||
### Automated Renewal with Certbot
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# /opt/qpq/scripts/renew-cert.sh
|
||||
set -euo pipefail
|
||||
|
||||
DOMAIN="chat.example.com"
|
||||
CERT_DIR="/etc/letsencrypt/live/$DOMAIN"
|
||||
QPQ_DATA="/var/lib/quicproquo"
|
||||
|
||||
certbot renew --quiet
|
||||
|
||||
openssl x509 -in "$CERT_DIR/fullchain.pem" -outform DER -out "$QPQ_DATA/server-cert.der"
|
||||
openssl pkey -in "$CERT_DIR/privkey.pem" -outform DER -out "$QPQ_DATA/server-key.der"
|
||||
chmod 600 "$QPQ_DATA/server-key.der"
|
||||
chown qpq:qpq "$QPQ_DATA/server-cert.der" "$QPQ_DATA/server-key.der"
|
||||
|
||||
systemctl restart qpq-server
|
||||
```
|
||||
|
||||
```cron
|
||||
# Run cert renewal check twice daily
|
||||
0 3,15 * * * /opt/qpq/scripts/renew-cert.sh >> /var/log/qpq-cert-renew.log 2>&1
|
||||
```
|
||||
|
||||
## Federation Certificate Rotation
|
||||
|
||||
Federation uses mutual TLS (mTLS) with a shared CA for server-to-server authentication.
|
||||
|
||||
### Procedure
|
||||
|
||||
```bash
|
||||
# 1. Generate a new federation certificate signed by the federation CA
|
||||
openssl req -new -nodes -keyout /tmp/federation-key.pem \
|
||||
-out /tmp/federation.csr -subj "/CN=chat.example.com"
|
||||
|
||||
openssl x509 -req -in /tmp/federation.csr \
|
||||
-CA federation-ca.pem -CAkey federation-ca-key.pem \
|
||||
-CAcreateserial -days 365 -out /tmp/federation-cert.pem
|
||||
|
||||
# 2. Convert to DER
|
||||
openssl x509 -in /tmp/federation-cert.pem -outform DER -out data/federation-cert.der
|
||||
openssl pkey -in /tmp/federation-key.pem -outform DER -out data/federation-key.der
|
||||
chmod 600 data/federation-key.der
|
||||
|
||||
# 3. Restart the server
|
||||
systemctl restart qpq-server
|
||||
|
||||
# 4. Coordinate with federation peers: they must trust the same CA
|
||||
```
|
||||
|
||||
## Database Encryption Key Rotation
|
||||
|
||||
The SQLCipher database key (`QPQ_DB_KEY`) encrypts all data at rest.
|
||||
|
||||
### Procedure (SQLCipher PRAGMA rekey)
|
||||
|
||||
```bash
|
||||
# 1. Stop the server
|
||||
systemctl stop qpq-server
|
||||
|
||||
# 2. Back up the database
|
||||
cp data/qpq.db /backups/qpq-pre-rekey-$(date +%Y%m%d).db
|
||||
|
||||
# 3. Rekey the database
|
||||
sqlite3 data/qpq.db <<EOF
|
||||
PRAGMA key = 'old-encryption-key';
|
||||
PRAGMA rekey = 'new-encryption-key';
|
||||
EOF
|
||||
|
||||
# 4. Verify the database opens with the new key
|
||||
sqlite3 data/qpq.db "PRAGMA key = 'new-encryption-key'; PRAGMA integrity_check;"
|
||||
|
||||
# 5. Update the environment/config with the new key
|
||||
# Option A: systemd
|
||||
systemctl edit qpq-server --force
|
||||
# Environment=QPQ_DB_KEY=new-encryption-key
|
||||
|
||||
# Option B: Docker Compose .env
|
||||
sed -i "s|^QPQ_DB_KEY=.*|QPQ_DB_KEY=new-encryption-key|" .env
|
||||
|
||||
# 6. Start the server
|
||||
systemctl start qpq-server
|
||||
```
|
||||
|
||||
### Full Re-encryption (Alternative)
|
||||
|
||||
If `PRAGMA rekey` is unavailable or you want a fresh database file:
|
||||
|
||||
```bash
|
||||
# 1. Stop the server and back up
|
||||
systemctl stop qpq-server
|
||||
cp data/qpq.db /backups/qpq-pre-rekey.db
|
||||
|
||||
# 2. Export with old key, import with new key
|
||||
sqlite3 data/qpq.db "PRAGMA key='old-key';" ".dump" | \
|
||||
sqlite3 data/qpq-new.db "PRAGMA key='new-key';" ".read /dev/stdin"
|
||||
|
||||
# 3. Replace the database
|
||||
mv data/qpq-new.db data/qpq.db
|
||||
|
||||
# 4. Update config and restart
|
||||
systemctl start qpq-server
|
||||
```
|
||||
|
||||
## OPAQUE ServerSetup Rotation
|
||||
|
||||
The OPAQUE ServerSetup is generated once and persisted. Rotating it invalidates all registered OPAQUE credentials.
|
||||
|
||||
**WARNING: Rotating the OPAQUE ServerSetup requires all users to re-register. Only do this if the setup is compromised.**
|
||||
|
||||
```bash
|
||||
# 1. Stop the server
|
||||
systemctl stop qpq-server
|
||||
|
||||
# 2. Back up the database
|
||||
cp data/qpq.db /backups/qpq-pre-opaque-rotate.db
|
||||
|
||||
# 3. Delete the persisted OPAQUE setup
|
||||
# For SQL backend:
|
||||
sqlite3 data/qpq.db "PRAGMA key='${QPQ_DB_KEY}'; DELETE FROM server_state WHERE key = 'opaque_setup';"
|
||||
|
||||
# For file backend:
|
||||
rm data/opaque_setup.bin 2>/dev/null || true
|
||||
|
||||
# 4. Start the server (it will generate a new OPAQUE ServerSetup)
|
||||
systemctl start qpq-server
|
||||
|
||||
# 5. All users must re-register (existing OPAQUE credentials are invalid)
|
||||
```
|
||||
|
||||
## Server Signing Key Rotation
|
||||
|
||||
The Ed25519 signing key is used for delivery proofs. Rotating it means old delivery proofs cannot be verified against the new key.
|
||||
|
||||
```bash
|
||||
# 1. Stop the server
|
||||
systemctl stop qpq-server
|
||||
|
||||
# 2. Back up
|
||||
cp data/qpq.db /backups/qpq-pre-sigkey-rotate.db
|
||||
|
||||
# 3. Delete the persisted signing key seed
|
||||
# For SQL backend:
|
||||
sqlite3 data/qpq.db "PRAGMA key='${QPQ_DB_KEY}'; DELETE FROM server_state WHERE key = 'signing_key_seed';"
|
||||
|
||||
# 4. Start the server (generates a new Ed25519 signing key)
|
||||
systemctl start qpq-server
|
||||
```
|
||||
|
||||
## Rotation Schedule
|
||||
|
||||
| Key Material | Rotation Frequency | Impact |
|
||||
|---|---|---|
|
||||
| Auth token | Quarterly or on compromise | Clients using bearer auth must update |
|
||||
| TLS certificate | Before expiry (automate with certbot) | Server restart required |
|
||||
| Federation cert | Annually or before expiry | Coordinate with peers |
|
||||
| DB encryption key | Annually or on compromise | Server downtime required |
|
||||
| OPAQUE ServerSetup | Only on compromise | All users must re-register |
|
||||
| Server signing key | Only on compromise | Old delivery proofs unverifiable |
|
||||
225
docs/operations/monitoring.md
Normal file
225
docs/operations/monitoring.md
Normal file
@@ -0,0 +1,225 @@
|
||||
# Monitoring Guide
|
||||
|
||||
This document covers metrics collection, alerting, and dashboards for quicproquo.
|
||||
|
||||
## Enabling Metrics
|
||||
|
||||
The server exports Prometheus metrics via HTTP when configured:
|
||||
|
||||
```bash
|
||||
# Environment variables
|
||||
QPQ_METRICS_LISTEN=0.0.0.0:9090
|
||||
QPQ_METRICS_ENABLED=true
|
||||
|
||||
# Or in qpq-server.toml
|
||||
metrics_listen = "0.0.0.0:9090"
|
||||
metrics_enabled = true
|
||||
```
|
||||
|
||||
Metrics are served at `http://<metrics_listen>/metrics` in Prometheus exposition format.
|
||||
|
||||
## Available Metrics
|
||||
|
||||
### Counters
|
||||
|
||||
| Metric | Description | Labels |
|
||||
|--------|-------------|--------|
|
||||
| `enqueue_total` | Total messages enqueued | - |
|
||||
| `enqueue_bytes_total` | Total bytes enqueued | - |
|
||||
| `fetch_total` | Total message fetches completed | - |
|
||||
| `fetch_wait_total` | Total long-poll fetch waits | - |
|
||||
| `key_package_upload_total` | Total MLS key package uploads | - |
|
||||
| `auth_login_success_total` | Successful OPAQUE login completions | - |
|
||||
| `auth_login_failure_total` | Failed login attempts | - |
|
||||
| `rate_limit_hit_total` | Rate limit rejections | - |
|
||||
|
||||
### Gauges
|
||||
|
||||
| Metric | Description |
|
||||
|--------|-------------|
|
||||
| `delivery_queue_depth` | Current delivery queue depth (sampled) |
|
||||
|
||||
## Prometheus Configuration
|
||||
|
||||
```yaml
|
||||
# prometheus.yml
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'qpq-server'
|
||||
    static_configs:
|
||||
- targets: ['qpq-server:9090']
|
||||
scrape_interval: 10s
|
||||
```
|
||||
|
||||
## Alert Rules
|
||||
|
||||
```yaml
|
||||
# prometheus-alerts.yml
|
||||
groups:
|
||||
- name: qpq-server
|
||||
rules:
|
||||
# Server down
|
||||
- alert: QpqServerDown
|
||||
expr: up{job="qpq-server"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "qpq-server is down"
|
||||
description: "Prometheus cannot scrape qpq-server metrics for > 1 minute."
|
||||
|
||||
# High auth failure rate (potential brute force)
|
||||
- alert: QpqHighAuthFailureRate
|
||||
expr: rate(auth_login_failure_total[5m]) > 10
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High authentication failure rate"
|
||||
description: "{{ $value | printf \"%.1f\" }} auth failures/sec over 5 minutes."
|
||||
|
||||
# Rate limiting active
|
||||
- alert: QpqRateLimitActive
|
||||
expr: rate(rate_limit_hit_total[5m]) > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Rate limiting is actively rejecting requests"
|
||||
description: "{{ $value | printf \"%.1f\" }} rate limit hits/sec."
|
||||
|
||||
# Delivery queue growing
|
||||
- alert: QpqDeliveryQueueHigh
|
||||
expr: delivery_queue_depth > 10000
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Delivery queue depth is high"
|
||||
description: "Queue depth: {{ $value }}. Clients may not be fetching."
|
||||
|
||||
- alert: QpqDeliveryQueueCritical
|
||||
expr: delivery_queue_depth > 100000
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Delivery queue depth is critical"
|
||||
description: "Queue depth: {{ $value }}. Investigate immediately."
|
||||
|
||||
# No enqueue activity (service may be stuck)
|
||||
- alert: QpqNoEnqueueActivity
|
||||
expr: rate(enqueue_total[15m]) == 0
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "No messages enqueued in 30 minutes"
|
||||
description: "Check if the service is accepting connections."
|
||||
|
||||
# Auth success ratio too low
|
||||
- alert: QpqLowAuthSuccessRatio
|
||||
expr: >
|
||||
rate(auth_login_success_total[5m])
|
||||
/ (rate(auth_login_success_total[5m]) + rate(auth_login_failure_total[5m]))
|
||||
< 0.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Auth success ratio below 50%"
|
||||
description: "More than half of login attempts are failing."
|
||||
```
|
||||
|
||||
## Key Dashboard Panels
|
||||
|
||||
See `dashboards/qpq-overview.json` for the full Grafana dashboard. Key panels:
|
||||
|
||||
### Message Throughput
|
||||
- **Enqueue rate**: `rate(enqueue_total[5m])`
|
||||
- **Fetch rate**: `rate(fetch_total[5m])`
|
||||
- **Enqueue bandwidth**: `rate(enqueue_bytes_total[5m])`
|
||||
|
||||
### Authentication
|
||||
- **Login success rate**: `rate(auth_login_success_total[5m])`
|
||||
- **Login failure rate**: `rate(auth_login_failure_total[5m])`
|
||||
- **Success ratio**: `rate(auth_login_success_total[5m]) / (rate(auth_login_success_total[5m]) + rate(auth_login_failure_total[5m]))`
|
||||
|
||||
### Delivery Queue
|
||||
- **Queue depth**: `delivery_queue_depth`
|
||||
- **Queue growth rate**: `deriv(delivery_queue_depth[10m])`
|
||||
|
||||
### Rate Limiting
|
||||
- **Rate limit hits**: `rate(rate_limit_hit_total[5m])`
|
||||
|
||||
### Infrastructure (Node Exporter)
|
||||
- CPU, memory, disk, network from `node_exporter`
|
||||
|
||||
## Grafana Dashboard
|
||||
|
||||
Import the dashboard from `dashboards/qpq-overview.json`:
|
||||
|
||||
1. Open Grafana -> Dashboards -> Import
|
||||
2. Upload `docs/operations/dashboards/qpq-overview.json`
|
||||
3. Select your Prometheus data source
|
||||
4. Save
|
||||
|
||||
## Log Monitoring
|
||||
|
||||
The server uses `tracing` with `RUST_LOG` environment variable:
|
||||
|
||||
```bash
|
||||
# Production: info level with structured JSON output
|
||||
RUST_LOG=info
|
||||
|
||||
# Debug specific modules
|
||||
RUST_LOG=info,quicproquo_server::node_service=debug
|
||||
|
||||
# Verbose debugging
|
||||
RUST_LOG=debug
|
||||
```
|
||||
|
||||
### Key Log Messages to Monitor
|
||||
|
||||
| Log Pattern | Meaning | Action |
|
||||
|-------------|---------|--------|
|
||||
| `"TLS certificate expires within 30 days"` | Cert expiring soon | Rotate certificate |
|
||||
| `"TLS certificate is self-signed"` | Self-signed cert in use | Replace with CA-signed cert in production |
|
||||
| `"connection rate limit exceeded"` | IP being rate limited | Check for DDoS |
|
||||
| `"running without QPQ_AUTH_TOKEN"` | Insecure mode | Must not appear in production |
|
||||
| `"db_key is empty; SQL store will be plaintext"` | Unencrypted DB | Must not appear in production |
|
||||
| `"shutdown signal received"` | Graceful shutdown started | Expected during deploys |
|
||||
| `"generated and persisted new OPAQUE ServerSetup"` | Fresh OPAQUE setup | Expected on first start only |
|
||||
|
||||
### Log Aggregation
|
||||
|
||||
For production, pipe logs to a log aggregator:
|
||||
|
||||
```bash
|
||||
# Systemd -> journald -> Loki/Elasticsearch
|
||||
journalctl -u qpq-server -f --output=json | \
|
||||
promtail --stdin --client.url=http://loki:3100/loki/api/v1/push
|
||||
|
||||
# Docker -> Loki driver
|
||||
docker run --log-driver=loki \
|
||||
--log-opt loki-url="http://loki:3100/loki/api/v1/push" \
|
||||
qpq-server
|
||||
```
|
||||
|
||||
## Health Checking
|
||||
|
||||
The Docker image includes a basic health check (TLS cert file exists). For deeper health checks:
|
||||
|
||||
```bash
|
||||
# Simple: check the process is running and port is open
|
||||
ss -ulnp | grep 7000
|
||||
|
||||
# Metrics endpoint (if enabled)
|
||||
curl -sf http://localhost:9090/metrics > /dev/null
|
||||
|
||||
# Full client connection test
|
||||
qpq-client --server 127.0.0.1:7000 --auth-token "$TOKEN" --ping
|
||||
```
|
||||
55
docs/operations/prometheus-alerts.yml
Normal file
55
docs/operations/prometheus-alerts.yml
Normal file
@@ -0,0 +1,55 @@
|
||||
groups:
|
||||
- name: qpq-server
|
||||
rules:
|
||||
- alert: QpqServerDown
|
||||
expr: up{job="qpq-server"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "qpq-server is down"
|
||||
description: "Prometheus cannot scrape qpq-server metrics for > 1 minute."
|
||||
|
||||
- alert: QpqHighAuthFailureRate
|
||||
expr: rate(auth_login_failure_total[5m]) > 10
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High authentication failure rate"
|
||||
description: "{{ $value | printf \"%.1f\" }} auth failures/sec over 5 minutes."
|
||||
|
||||
- alert: QpqRateLimitActive
|
||||
expr: rate(rate_limit_hit_total[5m]) > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Rate limiting is actively rejecting requests"
|
||||
|
||||
- alert: QpqDeliveryQueueHigh
|
||||
expr: delivery_queue_depth > 10000
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Delivery queue depth is high ({{ $value }})"
|
||||
|
||||
- alert: QpqDeliveryQueueCritical
|
||||
expr: delivery_queue_depth > 100000
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Delivery queue depth is critical ({{ $value }})"
|
||||
|
||||
- alert: QpqLowAuthSuccessRatio
|
||||
expr: >
|
||||
rate(auth_login_success_total[5m])
|
||||
/ (rate(auth_login_success_total[5m]) + rate(auth_login_failure_total[5m]))
|
||||
< 0.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Auth success ratio below 50%"
|
||||
12
docs/operations/prometheus.yml
Normal file
12
docs/operations/prometheus.yml
Normal file
@@ -0,0 +1,12 @@
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
rule_files:
|
||||
- "alerts.yml"
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'qpq-server'
|
||||
static_configs:
|
||||
- targets: ['server:9090']
|
||||
scrape_interval: 10s
|
||||
244
docs/operations/scaling-guide.md
Normal file
244
docs/operations/scaling-guide.md
Normal file
@@ -0,0 +1,244 @@
|
||||
# Scaling Guide
|
||||
|
||||
This document covers resource sizing, scaling triggers, and capacity planning for quicproquo deployments.
|
||||
|
||||
## Architecture Overview
|
||||
|
||||
quicproquo runs as a single-process server handling QUIC connections. Key resource consumers:
|
||||
|
||||
- **CPU**: TLS 1.3 handshakes (QUIC), OPAQUE PAKE authentication, message routing
|
||||
- **Memory**: In-memory session state (DashMap), QUIC connection state, delivery waiters, rate limit entries
|
||||
- **Disk I/O**: SQLCipher reads/writes (WAL mode), blob storage, KT Merkle log
|
||||
- **Network**: QUIC (UDP), metrics HTTP, optional WebSocket bridge
|
||||
|
||||
## Single-Node Sizing
|
||||
|
||||
### Minimum (Development / Small Team)
|
||||
|
||||
| Resource | Value |
|
||||
|----------|-------|
|
||||
| CPU | 1 vCPU |
|
||||
| Memory | 512 MB |
|
||||
| Disk | 10 GB SSD |
|
||||
| Network | 100 Mbps |
|
||||
|
||||
Supports ~100 concurrent users, light message traffic.
|
||||
|
||||
### Recommended (Production / Small-Medium)
|
||||
|
||||
| Resource | Value |
|
||||
|----------|-------|
|
||||
| CPU | 2-4 vCPU |
|
||||
| Memory | 2-4 GB |
|
||||
| Disk | 50-100 GB NVMe SSD |
|
||||
| Network | 1 Gbps |
|
||||
|
||||
Supports ~1,000-5,000 concurrent users.
|
||||
|
||||
### Large (High Traffic)
|
||||
|
||||
| Resource | Value |
|
||||
|----------|-------|
|
||||
| CPU | 8+ vCPU |
|
||||
| Memory | 8-16 GB |
|
||||
| Disk | 500 GB+ NVMe SSD (RAID 10) |
|
||||
| Network | 10 Gbps |
|
||||
|
||||
Supports ~10,000+ concurrent users.
|
||||
|
||||
## Scaling Triggers
|
||||
|
||||
Monitor these metrics and scale when thresholds are exceeded:
|
||||
|
||||
| Metric | Warning | Critical | Action |
|
||||
|--------|---------|----------|--------|
|
||||
| CPU usage | > 70% sustained (5 min) | > 90% sustained | Add CPU or scale horizontally |
|
||||
| Memory usage | > 75% | > 90% | Increase memory, check for leaks |
|
||||
| Disk usage | > 70% | > 90% | Expand volume, clean old data |
|
||||
| Disk I/O latency | > 5 ms p95 | > 20 ms p95 | Move to faster storage |
|
||||
| `delivery_queue_depth` | > 10,000 | > 100,000 | Investigate stale queues |
|
||||
| `rate_limit_hit_total` rate | > 100/min | > 1000/min | Investigate abuse, adjust limits |
|
||||
| `auth_login_failure_total` rate | > 50/min | > 500/min | Potential brute force attack |
|
||||
| Connection count | > 80% of `max_concurrent_bidi_streams` | > 95% | Scale horizontally |
|
||||
| TLS handshake latency | > 100 ms p95 | > 500 ms p95 | Add CPU, check network |
|
||||
|
||||
## Vertical Scaling
|
||||
|
||||
### CPU Scaling
|
||||
|
||||
The server is async (Tokio) and benefits from multiple cores. QUIC TLS handshakes and OPAQUE computations are CPU-intensive.
|
||||
|
||||
```bash
|
||||
# Check current CPU usage
|
||||
top -bn1 -p $(pgrep qpq-server)
|
||||
|
||||
# For Docker: increase CPU limits
|
||||
# docker-compose.prod.yml:
|
||||
# deploy:
|
||||
# resources:
|
||||
# limits:
|
||||
# cpus: '4'
|
||||
```
|
||||
|
||||
### Memory Scaling
|
||||
|
||||
In-memory state scales linearly with concurrent connections:
|
||||
- ~2-5 KB per active QUIC connection (quinn state)
|
||||
- ~200 bytes per session entry (DashMap)
|
||||
- ~100 bytes per rate limit entry
|
||||
- ~100 bytes per delivery waiter
|
||||
|
||||
```bash
|
||||
# Estimate memory for 10,000 connections:
|
||||
# 10,000 * 5 KB = ~50 MB for connections
|
||||
# 10,000 * 500 bytes = ~5 MB for sessions/rate limits
|
||||
# SQLCipher connection pool: ~50 MB (4 connections, caches)
|
||||
# Base process: ~30 MB
|
||||
# Total: ~135 MB + headroom = 256-512 MB minimum
|
||||
```
|
||||
|
||||
### Disk I/O Scaling
|
||||
|
||||
SQLCipher uses WAL mode for concurrent reads. For write-heavy workloads:
|
||||
|
||||
```bash
|
||||
# Check current I/O
|
||||
iostat -x 1 5
|
||||
|
||||
# Move to NVMe if on spinning disk
|
||||
# Increase WAL autocheckpoint threshold for burst writes
|
||||
sqlite3 data/qpq.db "PRAGMA key='${QPQ_DB_KEY}'; PRAGMA wal_autocheckpoint=2000;"
|
||||
```
|
||||
|
||||
## Horizontal Scaling
|
||||
|
||||
quicproquo does not yet have built-in multi-node clustering. For horizontal scaling, use these patterns:
|
||||
|
||||
### Load Balancer (UDP/QUIC)
|
||||
|
||||
Place a UDP load balancer in front of multiple qpq-server instances. Each instance runs independently with its own database.
|
||||
|
||||
```
|
||||
+-----------+
|
||||
clients ------> | L4 LB | ----> qpq-server-1 (db-1)
|
||||
| (UDP/QUIC)| ----> qpq-server-2 (db-2)
|
||||
+-----------+ qpq-server-3 (db-3)
|
||||
```
|
||||
|
||||
**Requirements:**
|
||||
- Sticky sessions (by client IP or QUIC connection ID) so a client always reaches the same node
|
||||
- Shared storage backend or federation between nodes
|
||||
|
||||
### Federation for Multi-Node
|
||||
|
||||
Enable federation to relay messages between nodes:
|
||||
|
||||
```toml
|
||||
# qpq-server.toml on node-1
|
||||
[federation]
|
||||
enabled = true
|
||||
domain = "node1.chat.example.com"
|
||||
listen = "0.0.0.0:7001"
|
||||
federation_cert = "data/federation-cert.der"
|
||||
federation_key = "data/federation-key.der"
|
||||
federation_ca = "data/federation-ca.der"
|
||||
|
||||
[[federation.peers]]
|
||||
domain = "node2.chat.example.com"
|
||||
address = "10.0.1.2:7001"
|
||||
```
|
||||
|
||||
### Shared Database (PostgreSQL)
|
||||
|
||||
For true horizontal scaling, migrate from SQLCipher to a shared PostgreSQL instance. This is not yet implemented but is the planned approach for multi-node deployments.
|
||||
|
||||
```
|
||||
qpq-server-1 --\
|
||||
qpq-server-2 ---+--> PostgreSQL (shared)
|
||||
qpq-server-3 --/
|
||||
```
|
||||
|
||||
## Connection Tuning
|
||||
|
||||
The server has these QUIC transport defaults:
|
||||
|
||||
| Parameter | Default | Tunable |
|
||||
|-----------|---------|---------|
|
||||
| Max idle timeout | 300s (5 min) | Code change required |
|
||||
| Max concurrent bidi streams | 1 per connection | Code change required |
|
||||
| Max concurrent uni streams | 0 | Code change required |
|
||||
| SQLCipher connection pool | 4 connections | Code change required |
|
||||
|
||||
For high connection counts, consider:
|
||||
- Increasing the OS file descriptor limit: `ulimit -n 65536`
|
||||
- Increasing UDP buffer sizes:
|
||||
|
||||
```bash
|
||||
# /etc/sysctl.d/99-qpq.conf
|
||||
net.core.rmem_max = 26214400
|
||||
net.core.wmem_max = 26214400
|
||||
net.core.rmem_default = 1048576
|
||||
net.core.wmem_default = 1048576
|
||||
```
|
||||
|
||||
```bash
|
||||
sysctl -p /etc/sysctl.d/99-qpq.conf
|
||||
```
|
||||
|
||||
## Docker Resource Limits
|
||||
|
||||
```yaml
|
||||
# docker-compose.prod.yml
|
||||
services:
|
||||
server:
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: '4'
|
||||
memory: 4G
|
||||
reservations:
|
||||
cpus: '2'
|
||||
memory: 1G
|
||||
ulimits:
|
||||
nofile:
|
||||
soft: 65536
|
||||
hard: 65536
|
||||
```
|
||||
|
||||
## Load Testing
|
||||
|
||||
Use the included test infrastructure to benchmark:
|
||||
|
||||
```bash
|
||||
# Build the test client
|
||||
cargo build --release --bin qpq-client
|
||||
|
||||
# Run concurrent connection test (example)
|
||||
for i in $(seq 1 100); do
|
||||
qpq-client --server 127.0.0.1:7000 --auth-token "$QPQ_AUTH_TOKEN" &
|
||||
done
|
||||
wait
|
||||
|
||||
# Monitor during load test
|
||||
watch -n1 'curl -s http://localhost:9090/metrics | grep -E "enqueue_total|fetch_total|delivery_queue_depth|rate_limit"'
|
||||
```
|
||||
|
||||
## Capacity Planning Worksheet
|
||||
|
||||
| Parameter | Your Value |
|
||||
|-----------|-----------|
|
||||
| Expected concurrent users | |
|
||||
| Messages per user per hour | |
|
||||
| Average message size (bytes) | |
|
||||
| Blob uploads per day | |
|
||||
| Average blob size (MB) | |
|
||||
| Data retention (days) | |
|
||||
|
||||
**Formulas:**
|
||||
|
||||
```
|
||||
Storage per day = (users * msgs/hr * 24 * avg_msg_size) + (blob_uploads * avg_blob_size)
|
||||
DB growth per month = storage_per_day * 30
|
||||
Memory estimate = (concurrent_users * 5 KB) + 256 MB base
|
||||
CPU estimate = 1 vCPU per ~2,500 concurrent connections (depends on message rate)
|
||||
```
|
||||
Reference in New Issue
Block a user