feat: add graceful shutdown with drain timeout and per-RPC timeouts

Graceful shutdown (Phase 6.4):
- Listen for SIGTERM + SIGINT via tokio::signal
- Configurable drain timeout (--drain-timeout / QPQ_DRAIN_TIMEOUT, default 30s)
- Health endpoint returns "draining" during shutdown for load balancer awareness
- ServerState carries atomic draining flag
- Add RpcStatus::Unavailable (9) for shutdown-related rejections

Per-RPC timeouts (Phase 6.5):
- Add RpcStatus::DeadlineExceeded (8) for server-side timeouts
- MethodRegistry supports default_timeout and per-method timeout overrides
- RPC dispatch wraps handler invocation with tokio::time::timeout
- RequestContext carries optional deadline (Instant) for handlers
- Health: 5s timeout, blob upload/download: 120s timeout, default: 30s
- Config: --rpc-timeout / QPQ_RPC_TIMEOUT, --storage-timeout / QPQ_STORAGE_TIMEOUT
This commit is contained in:
2026-03-04 20:33:26 +01:00
parent 91c5495ab7
commit e93a38243f
10 changed files with 545 additions and 26 deletions

View File

@@ -113,6 +113,9 @@ async fn handle_connection<S: Send + Sync + 'static>(
let remote = connection.remote_address();
debug!(remote = %remote, "new connection");
metrics::gauge!("rpc_active_connections").increment(1.0);
metrics::counter!("rpc_connections_total").increment(1);
// Perform auth handshake on the first bi-stream.
let conn_state = {
let (mut send, mut recv) = connection
@@ -136,7 +139,7 @@ async fn handle_connection<S: Send + Sync + 'static>(
};
// Accept RPC streams.
loop {
let result = loop {
let stream = connection.accept_bi().await;
match stream {
Ok((send, recv)) => {
@@ -153,16 +156,17 @@ async fn handle_connection<S: Send + Sync + 'static>(
}
Err(quinn::ConnectionError::ApplicationClosed(_)) => {
debug!(remote = %remote, "connection closed by peer");
break;
break Ok(());
}
Err(e) => {
debug!(remote = %remote, "accept_bi error: {e}");
break;
break Ok(());
}
}
}
};
Ok(())
metrics::gauge!("rpc_active_connections").decrement(1.0);
result
}
/// Handle a single bi-directional stream: read request, dispatch, write response.
@@ -194,18 +198,57 @@ async fn handle_stream<S: Send + Sync + 'static>(
None => return Err(RpcError::Decode("incomplete request frame".into())),
};
let trace_id = uuid::Uuid::now_v7().to_string();
let result = match registry.get(frame.method_id) {
Some((handler, name)) => {
debug!(method_id = frame.method_id, method = name, req_id = frame.request_id, "dispatching");
Some((handler, name, timeout)) => {
let span = tracing::info_span!(
"rpc",
trace_id = %trace_id,
method_id = frame.method_id,
method = name,
req_id = frame.request_id,
);
let _guard = span.enter();
debug!("dispatching");
let deadline = timeout.map(|d| tokio::time::Instant::now() + d);
let start = std::time::Instant::now();
let ctx = RequestContext {
identity_key: conn_state.identity_key.clone(),
session_token: conn_state.session_token.clone(),
payload: frame.payload,
trace_id: trace_id.clone(),
deadline,
};
handler(Arc::clone(&state), ctx).await
let result = if let Some(dur) = timeout {
match tokio::time::timeout(dur, handler(Arc::clone(&state), ctx)).await {
Ok(r) => r,
Err(_) => {
warn!(method = name, timeout_ms = dur.as_millis() as u64, "request deadline exceeded");
HandlerResult::err(RpcStatus::DeadlineExceeded, "request deadline exceeded")
}
}
} else {
handler(Arc::clone(&state), ctx).await
};
let elapsed = start.elapsed();
// Per-endpoint latency histogram.
metrics::histogram!("rpc_request_duration_seconds", "method" => name)
.record(elapsed.as_secs_f64());
metrics::counter!("rpc_requests_total", "method" => name, "status" => status_label(result.status))
.increment(1);
result
}
None => {
warn!(method_id = frame.method_id, "unknown method");
warn!(method_id = frame.method_id, trace_id = %trace_id, "unknown method");
metrics::counter!("rpc_requests_total", "method" => "unknown", "status" => "unknown_method")
.increment(1);
HandlerResult::err(RpcStatus::UnknownMethod, "unknown method")
}
};
@@ -225,6 +268,22 @@ async fn handle_stream<S: Send + Sync + 'static>(
Ok(())
}
/// Map an [`RpcStatus`] to the short, static label used as the `status` tag on
/// request metrics.
///
/// Arms are listed alphabetically by label. The match deliberately has no `_`
/// arm, so a newly added `RpcStatus` variant fails to compile here until it is
/// given a label.
fn status_label(status: RpcStatus) -> &'static str {
    match status {
        RpcStatus::BadRequest => "bad_request",
        RpcStatus::DeadlineExceeded => "deadline_exceeded",
        RpcStatus::Forbidden => "forbidden",
        RpcStatus::Internal => "internal",
        RpcStatus::NotFound => "not_found",
        RpcStatus::Ok => "ok",
        RpcStatus::RateLimited => "rate_limited",
        RpcStatus::Unauthorized => "unauthorized",
        RpcStatus::Unavailable => "unavailable",
        RpcStatus::UnknownMethod => "unknown_method",
    }
}
/// Send a push event to a client via a QUIC uni-stream.
pub async fn send_push(
connection: &quinn::Connection,