feat: add graceful shutdown with drain timeout and per-RPC timeouts

Graceful shutdown (Phase 6.4):
- Listen for SIGTERM + SIGINT via tokio::signal
- Configurable drain timeout (--drain-timeout / QPQ_DRAIN_TIMEOUT, default 30s)
- Health endpoint returns "draining" during shutdown so load balancers stop routing new traffic
- ServerState carries an atomic draining flag (Arc<AtomicBool>)
- Add RpcStatus::Unavailable (9) for shutdown-related rejections

Per-RPC timeouts (Phase 6.5):
- Add RpcStatus::DeadlineExceeded (8) for server-side timeouts
- MethodRegistry supports default_timeout and per-method timeout overrides
- RPC dispatch wraps handler invocation with tokio::time::timeout
- RequestContext carries optional deadline (Instant) for handlers
- Timeouts: health 5s, blob upload/download 120s, all other methods use the default (30s)
- Config: --rpc-timeout / QPQ_RPC_TIMEOUT, --storage-timeout / QPQ_STORAGE_TIMEOUT
This commit is contained in:
2026-03-04 20:33:26 +01:00
parent 91c5495ab7
commit e93a38243f
10 changed files with 545 additions and 26 deletions

View File

@@ -1,6 +1,7 @@
//! v2 RPC handler dispatch — protobuf in, domain logic, protobuf out.
use std::path::PathBuf;
use std::sync::atomic::AtomicBool;
use std::sync::Arc;
use dashmap::DashMap;
@@ -11,6 +12,7 @@ use quicproquo_rpc::error::RpcStatus;
use quicproquo_rpc::method::{HandlerResult, MethodRegistry, RequestContext};
use tokio::sync::Notify;
use crate::audit::AuditLogger;
use crate::auth::{AuthConfig, PendingLogin, RateEntry, SessionInfo};
use crate::hooks::ServerHooks;
use crate::storage::Store;
@@ -44,6 +46,11 @@ pub struct ServerState {
pub kt_log: Arc<std::sync::Mutex<quicproquo_kt::MerkleLog>>,
pub data_dir: PathBuf,
pub redact_logs: bool,
/// Structured audit logger for security-relevant events.
pub audit_logger: Arc<dyn AuditLogger>,
/// When true, the server is draining and will reject new work.
/// Health endpoint returns "draining" status so load balancers stop routing.
pub draining: Arc<AtomicBool>,
/// Idempotency dedup: message_id -> (seq, timestamp). TTL-cleaned by cleanup task.
pub seen_message_ids: Arc<DashMap<Vec<u8>, (u64, u64)>>,
/// Banned users: identity_key -> BanRecord.
@@ -154,9 +161,13 @@ pub fn domain_err(e: crate::domain::types::DomainError) -> HandlerResult {
}
}
/// Build the v2 method registry with all 33 handlers registered.
pub fn build_registry() -> MethodRegistry<ServerState> {
/// Build the v2 method registry with all handlers registered.
///
/// `default_rpc_timeout` sets the server-wide per-RPC timeout. Individual methods
/// (e.g. blob upload, health) may override this with shorter or longer values.
pub fn build_registry(default_rpc_timeout: std::time::Duration) -> MethodRegistry<ServerState> {
let mut reg = MethodRegistry::new();
reg.set_default_timeout(default_rpc_timeout);
// Auth (100-103)
reg.register(
@@ -264,15 +275,17 @@ pub fn build_registry() -> MethodRegistry<ServerState> {
user::handle_resolve_identity,
);
// Blob (600-601)
reg.register(
// Blob (600-601) — longer timeout for file transfers.
reg.register_with_timeout(
method_ids::UPLOAD_BLOB,
"UploadBlob",
std::time::Duration::from_secs(120),
blob::handle_upload_blob,
);
reg.register(
reg.register_with_timeout(
method_ids::DOWNLOAD_BLOB,
"DownloadBlob",
std::time::Duration::from_secs(120),
blob::handle_download_blob,
);
@@ -304,7 +317,12 @@ pub fn build_registry() -> MethodRegistry<ServerState> {
"ResolveEndpoint",
p2p::handle_resolve_endpoint,
);
reg.register(method_ids::HEALTH, "Health", p2p::handle_health);
reg.register_with_timeout(
method_ids::HEALTH,
"Health",
std::time::Duration::from_secs(5),
p2p::handle_health,
);
// Federation (900-905)
reg.register(