feat(p2p): add production infrastructure modules
- error.rs: Structured error types with context for all subsystems (transport, routing, crypto, protocol, store, config) - config.rs: Runtime configuration with TOML parsing and validation - metrics.rs: Counter/gauge/histogram metrics with transport-specific tracking and JSON-serializable snapshots - rate_limit.rs: Token bucket rate limiting with per-peer tracking, duty cycle enforcement for LoRa, and backpressure control These modules provide the foundation for production deployment.
This commit is contained in:
502
crates/quicprochat-p2p/src/metrics.rs
Normal file
502
crates/quicprochat-p2p/src/metrics.rs
Normal file
@@ -0,0 +1,502 @@
|
||||
//! Observability metrics for mesh networking.
|
||||
//!
|
||||
//! This module provides structured metrics collection for monitoring
|
||||
//! mesh node health, performance, and resource usage.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
/// Atomic counter for thread-safe metric updates.
|
||||
#[derive(Debug, Default)]
|
||||
pub struct Counter(AtomicU64);
|
||||
|
||||
impl Counter {
|
||||
pub fn new() -> Self {
|
||||
Self(AtomicU64::new(0))
|
||||
}
|
||||
|
||||
pub fn inc(&self) {
|
||||
self.0.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub fn inc_by(&self, n: u64) {
|
||||
self.0.fetch_add(n, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub fn get(&self) -> u64 {
|
||||
self.0.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
pub fn reset(&self) -> u64 {
|
||||
self.0.swap(0, Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
/// Gauge for values that can go up and down.
|
||||
#[derive(Debug, Default)]
|
||||
pub struct Gauge(AtomicU64);
|
||||
|
||||
impl Gauge {
|
||||
pub fn new() -> Self {
|
||||
Self(AtomicU64::new(0))
|
||||
}
|
||||
|
||||
pub fn set(&self, val: u64) {
|
||||
self.0.store(val, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub fn inc(&self) {
|
||||
self.0.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub fn dec(&self) {
|
||||
self.0.fetch_sub(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub fn get(&self) -> u64 {
|
||||
self.0.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
/// Histogram for tracking distributions (simple bucket-based).
|
||||
#[derive(Debug)]
|
||||
pub struct Histogram {
|
||||
/// Bucket boundaries (upper limits).
|
||||
buckets: Vec<u64>,
|
||||
/// Count in each bucket.
|
||||
counts: Vec<AtomicU64>,
|
||||
/// Sum of all values.
|
||||
sum: AtomicU64,
|
||||
/// Total count.
|
||||
count: AtomicU64,
|
||||
}
|
||||
|
||||
impl Histogram {
|
||||
/// Create with default latency buckets (ms).
|
||||
pub fn latency_ms() -> Self {
|
||||
Self::new(vec![1, 5, 10, 25, 50, 100, 250, 500, 1000, 5000, 10000])
|
||||
}
|
||||
|
||||
/// Create with default size buckets (bytes).
|
||||
pub fn size_bytes() -> Self {
|
||||
Self::new(vec![64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 65536])
|
||||
}
|
||||
|
||||
pub fn new(buckets: Vec<u64>) -> Self {
|
||||
let counts = buckets.iter().map(|_| AtomicU64::new(0)).collect();
|
||||
Self {
|
||||
buckets,
|
||||
counts,
|
||||
sum: AtomicU64::new(0),
|
||||
count: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn observe(&self, value: u64) {
|
||||
self.sum.fetch_add(value, Ordering::Relaxed);
|
||||
self.count.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
for (i, &upper) in self.buckets.iter().enumerate() {
|
||||
if value <= upper {
|
||||
self.counts[i].fetch_add(1, Ordering::Relaxed);
|
||||
return;
|
||||
}
|
||||
}
|
||||
// Value exceeds all buckets — count in last
|
||||
if let Some(last) = self.counts.last() {
|
||||
last.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn observe_duration(&self, d: Duration) {
|
||||
self.observe(d.as_millis() as u64);
|
||||
}
|
||||
|
||||
pub fn sum(&self) -> u64 {
|
||||
self.sum.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
pub fn count(&self) -> u64 {
|
||||
self.count.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
pub fn avg(&self) -> f64 {
|
||||
let count = self.count();
|
||||
if count == 0 {
|
||||
0.0
|
||||
} else {
|
||||
self.sum() as f64 / count as f64
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-transport metrics.
|
||||
#[derive(Debug, Default)]
|
||||
pub struct TransportMetrics {
|
||||
/// Messages sent successfully.
|
||||
pub sent: Counter,
|
||||
/// Messages received.
|
||||
pub received: Counter,
|
||||
/// Send failures.
|
||||
pub send_errors: Counter,
|
||||
/// Receive errors.
|
||||
pub recv_errors: Counter,
|
||||
/// Bytes sent.
|
||||
pub bytes_sent: Counter,
|
||||
/// Bytes received.
|
||||
pub bytes_received: Counter,
|
||||
/// Active connections (for connection-oriented transports).
|
||||
pub connections: Gauge,
|
||||
}
|
||||
|
||||
/// Per-peer metrics.
|
||||
#[derive(Debug)]
|
||||
pub struct PeerMetrics {
|
||||
/// Messages sent to this peer.
|
||||
pub messages_sent: Counter,
|
||||
/// Messages received from this peer.
|
||||
pub messages_received: Counter,
|
||||
/// Last seen timestamp.
|
||||
pub last_seen: RwLock<Option<Instant>>,
|
||||
/// Round-trip time samples.
|
||||
pub rtt_ms: Histogram,
|
||||
}
|
||||
|
||||
impl Default for PeerMetrics {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
messages_sent: Counter::new(),
|
||||
messages_received: Counter::new(),
|
||||
last_seen: RwLock::new(None),
|
||||
rtt_ms: Histogram::latency_ms(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PeerMetrics {
|
||||
pub fn touch(&self) {
|
||||
if let Ok(mut last) = self.last_seen.write() {
|
||||
*last = Some(Instant::now());
|
||||
}
|
||||
}
|
||||
|
||||
pub fn age(&self) -> Option<Duration> {
|
||||
self.last_seen
|
||||
.read()
|
||||
.ok()
|
||||
.and_then(|t| t.map(|i| i.elapsed()))
|
||||
}
|
||||
}
|
||||
|
||||
/// Global mesh metrics.
|
||||
#[derive(Debug)]
|
||||
pub struct MeshMetrics {
|
||||
/// Transport metrics by name.
|
||||
pub transports: RwLock<HashMap<String, Arc<TransportMetrics>>>,
|
||||
/// Routing metrics.
|
||||
pub routing: RoutingMetrics,
|
||||
/// Store metrics.
|
||||
pub store: StoreMetrics,
|
||||
/// Crypto metrics.
|
||||
pub crypto: CryptoMetrics,
|
||||
/// Protocol metrics.
|
||||
pub protocol: ProtocolMetrics,
|
||||
/// Node start time.
|
||||
pub started_at: Instant,
|
||||
}
|
||||
|
||||
impl Default for MeshMetrics {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl MeshMetrics {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
transports: RwLock::new(HashMap::new()),
|
||||
routing: RoutingMetrics::default(),
|
||||
store: StoreMetrics::default(),
|
||||
crypto: CryptoMetrics::default(),
|
||||
protocol: ProtocolMetrics::default(),
|
||||
started_at: Instant::now(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get or create transport metrics.
|
||||
pub fn transport(&self, name: &str) -> Arc<TransportMetrics> {
|
||||
{
|
||||
let map = self.transports.read().unwrap();
|
||||
if let Some(m) = map.get(name) {
|
||||
return Arc::clone(m);
|
||||
}
|
||||
}
|
||||
let mut map = self.transports.write().unwrap();
|
||||
map.entry(name.to_string())
|
||||
.or_insert_with(|| Arc::new(TransportMetrics::default()))
|
||||
.clone()
|
||||
}
|
||||
|
||||
/// Node uptime.
|
||||
pub fn uptime(&self) -> Duration {
|
||||
self.started_at.elapsed()
|
||||
}
|
||||
|
||||
/// Export metrics as a snapshot.
|
||||
pub fn snapshot(&self) -> MetricsSnapshot {
|
||||
let transports = self.transports.read().unwrap();
|
||||
let transport_snapshots: HashMap<String, TransportSnapshot> = transports
|
||||
.iter()
|
||||
.map(|(name, m)| {
|
||||
(
|
||||
name.clone(),
|
||||
TransportSnapshot {
|
||||
sent: m.sent.get(),
|
||||
received: m.received.get(),
|
||||
send_errors: m.send_errors.get(),
|
||||
bytes_sent: m.bytes_sent.get(),
|
||||
bytes_received: m.bytes_received.get(),
|
||||
connections: m.connections.get(),
|
||||
},
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
MetricsSnapshot {
|
||||
uptime_secs: self.uptime().as_secs(),
|
||||
transports: transport_snapshots,
|
||||
routing: RoutingSnapshot {
|
||||
table_size: self.routing.table_size.get(),
|
||||
lookups: self.routing.lookups.get(),
|
||||
lookup_misses: self.routing.lookup_misses.get(),
|
||||
announcements_processed: self.routing.announcements_processed.get(),
|
||||
},
|
||||
store: StoreSnapshot {
|
||||
messages_stored: self.store.messages_stored.get(),
|
||||
messages_delivered: self.store.messages_delivered.get(),
|
||||
messages_expired: self.store.messages_expired.get(),
|
||||
current_size: self.store.current_size.get(),
|
||||
},
|
||||
crypto: CryptoSnapshot {
|
||||
encryptions: self.crypto.encryptions.get(),
|
||||
decryptions: self.crypto.decryptions.get(),
|
||||
signature_verifications: self.crypto.signature_verifications.get(),
|
||||
signature_failures: self.crypto.signature_failures.get(),
|
||||
replay_detections: self.crypto.replay_detections.get(),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Routing subsystem metrics.
|
||||
#[derive(Debug, Default)]
|
||||
pub struct RoutingMetrics {
|
||||
/// Current routing table size.
|
||||
pub table_size: Gauge,
|
||||
/// Route lookups.
|
||||
pub lookups: Counter,
|
||||
/// Route lookup misses.
|
||||
pub lookup_misses: Counter,
|
||||
/// Routes added.
|
||||
pub routes_added: Counter,
|
||||
/// Routes expired.
|
||||
pub routes_expired: Counter,
|
||||
/// Announcements processed.
|
||||
pub announcements_processed: Counter,
|
||||
/// Announcements forwarded.
|
||||
pub announcements_forwarded: Counter,
|
||||
/// Duplicate announcements dropped.
|
||||
pub duplicates_dropped: Counter,
|
||||
}
|
||||
|
||||
/// Store subsystem metrics.
|
||||
#[derive(Debug, Default)]
|
||||
pub struct StoreMetrics {
|
||||
/// Messages stored.
|
||||
pub messages_stored: Counter,
|
||||
/// Messages delivered.
|
||||
pub messages_delivered: Counter,
|
||||
/// Messages expired.
|
||||
pub messages_expired: Counter,
|
||||
/// Current store size.
|
||||
pub current_size: Gauge,
|
||||
/// Store capacity reached events.
|
||||
pub capacity_reached: Counter,
|
||||
}
|
||||
|
||||
/// Crypto subsystem metrics.
|
||||
#[derive(Debug)]
|
||||
pub struct CryptoMetrics {
|
||||
/// Successful encryptions.
|
||||
pub encryptions: Counter,
|
||||
/// Successful decryptions.
|
||||
pub decryptions: Counter,
|
||||
/// Decryption failures.
|
||||
pub decryption_failures: Counter,
|
||||
/// Signature verifications.
|
||||
pub signature_verifications: Counter,
|
||||
/// Signature failures.
|
||||
pub signature_failures: Counter,
|
||||
/// Replay attacks detected.
|
||||
pub replay_detections: Counter,
|
||||
/// Encryption latency.
|
||||
pub encrypt_latency: Histogram,
|
||||
}
|
||||
|
||||
impl Default for CryptoMetrics {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
encryptions: Counter::new(),
|
||||
decryptions: Counter::new(),
|
||||
decryption_failures: Counter::new(),
|
||||
signature_verifications: Counter::new(),
|
||||
signature_failures: Counter::new(),
|
||||
replay_detections: Counter::new(),
|
||||
encrypt_latency: Histogram::latency_ms(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Protocol metrics.
|
||||
#[derive(Debug, Default)]
|
||||
pub struct ProtocolMetrics {
|
||||
/// Messages parsed.
|
||||
pub messages_parsed: Counter,
|
||||
/// Parse errors.
|
||||
pub parse_errors: Counter,
|
||||
/// Unknown message types.
|
||||
pub unknown_types: Counter,
|
||||
/// Messages too large.
|
||||
pub oversized: Counter,
|
||||
}
|
||||
|
||||
/// Point-in-time snapshot of metrics.
|
||||
#[derive(Debug, Clone, serde::Serialize)]
|
||||
pub struct MetricsSnapshot {
|
||||
pub uptime_secs: u64,
|
||||
pub transports: HashMap<String, TransportSnapshot>,
|
||||
pub routing: RoutingSnapshot,
|
||||
pub store: StoreSnapshot,
|
||||
pub crypto: CryptoSnapshot,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize)]
|
||||
pub struct TransportSnapshot {
|
||||
pub sent: u64,
|
||||
pub received: u64,
|
||||
pub send_errors: u64,
|
||||
pub bytes_sent: u64,
|
||||
pub bytes_received: u64,
|
||||
pub connections: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize)]
|
||||
pub struct RoutingSnapshot {
|
||||
pub table_size: u64,
|
||||
pub lookups: u64,
|
||||
pub lookup_misses: u64,
|
||||
pub announcements_processed: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize)]
|
||||
pub struct StoreSnapshot {
|
||||
pub messages_stored: u64,
|
||||
pub messages_delivered: u64,
|
||||
pub messages_expired: u64,
|
||||
pub current_size: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize)]
|
||||
pub struct CryptoSnapshot {
|
||||
pub encryptions: u64,
|
||||
pub decryptions: u64,
|
||||
pub signature_verifications: u64,
|
||||
pub signature_failures: u64,
|
||||
pub replay_detections: u64,
|
||||
}
|
||||
|
||||
/// Global metrics instance.
|
||||
static GLOBAL_METRICS: std::sync::OnceLock<Arc<MeshMetrics>> = std::sync::OnceLock::new();
|
||||
|
||||
/// Get the global metrics instance.
|
||||
pub fn metrics() -> &'static Arc<MeshMetrics> {
|
||||
GLOBAL_METRICS.get_or_init(|| Arc::new(MeshMetrics::new()))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn counter_basics() {
|
||||
let c = Counter::new();
|
||||
assert_eq!(c.get(), 0);
|
||||
c.inc();
|
||||
assert_eq!(c.get(), 1);
|
||||
c.inc_by(5);
|
||||
assert_eq!(c.get(), 6);
|
||||
let old = c.reset();
|
||||
assert_eq!(old, 6);
|
||||
assert_eq!(c.get(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn gauge_basics() {
|
||||
let g = Gauge::new();
|
||||
assert_eq!(g.get(), 0);
|
||||
g.set(10);
|
||||
assert_eq!(g.get(), 10);
|
||||
g.inc();
|
||||
assert_eq!(g.get(), 11);
|
||||
g.dec();
|
||||
assert_eq!(g.get(), 10);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn histogram_basics() {
|
||||
let h = Histogram::new(vec![10, 50, 100]);
|
||||
h.observe(5);
|
||||
h.observe(25);
|
||||
h.observe(75);
|
||||
h.observe(200);
|
||||
|
||||
assert_eq!(h.count(), 4);
|
||||
assert_eq!(h.sum(), 5 + 25 + 75 + 200);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn transport_metrics() {
|
||||
let m = MeshMetrics::new();
|
||||
let tcp = m.transport("tcp");
|
||||
tcp.sent.inc();
|
||||
tcp.bytes_sent.inc_by(100);
|
||||
|
||||
assert_eq!(tcp.sent.get(), 1);
|
||||
assert_eq!(tcp.bytes_sent.get(), 100);
|
||||
|
||||
// Same name returns same instance
|
||||
let tcp2 = m.transport("tcp");
|
||||
assert_eq!(tcp2.sent.get(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn snapshot_serializes() {
|
||||
let m = MeshMetrics::new();
|
||||
m.transport("tcp").sent.inc();
|
||||
m.routing.lookups.inc_by(10);
|
||||
|
||||
let snapshot = m.snapshot();
|
||||
let json = serde_json::to_string(&snapshot).expect("serialize");
|
||||
assert!(json.contains("\"uptime_secs\":"));
|
||||
assert!(json.contains("\"lookups\":10"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn global_metrics() {
|
||||
let m = metrics();
|
||||
m.protocol.messages_parsed.inc();
|
||||
assert_eq!(metrics().protocol.messages_parsed.get(), 1);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user