tokio_quiche/metrics/
mod.rs

1// Copyright (C) 2025, Cloudflare, Inc.
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are
6// met:
7//
8//     * Redistributions of source code must retain the above copyright notice,
9//       this list of conditions and the following disclaimer.
10//
11//     * Redistributions in binary form must reproduce the above copyright
12//       notice, this list of conditions and the following disclaimer in the
13//       documentation and/or other materials provided with the distribution.
14//
15// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
16// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
17// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
19// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27//! Metrics collected across QUIC connections.
28
29pub mod labels;
30pub mod tokio_task;
31
32use foundations::telemetry::metrics::metrics;
33use foundations::telemetry::metrics::Counter;
34use foundations::telemetry::metrics::Gauge;
35use foundations::telemetry::metrics::Histogram;
36use foundations::telemetry::metrics::HistogramBuilder;
37use foundations::telemetry::metrics::TimeHistogram;
38use std::net::IpAddr;
39use std::sync::Arc;
40
/// Trait to direct the metrics emitted by the crate to a Prometheus registry.
///
/// Every method returns a handle ([`Counter`], [`Gauge`], [`Histogram`] or
/// [`TimeHistogram`]) for a single metric. [`DefaultMetrics`] is the
/// implementation provided by this module.
pub trait Metrics: Send + Sync + Clone + Unpin + 'static {
    /// Number of QUIC connections currently in memory.
    fn connections_in_memory(&self) -> Gauge;

    /// Maximum number of writable QUIC streams in a connection.
    fn maximum_writable_streams(&self) -> Histogram;

    /// Overhead of the given QUIC handshake processing `stage`.
    fn handshake_time_seconds(
        &self, stage: labels::QuicHandshakeStage,
    ) -> TimeHistogram;

    /// Number of errors and partial writes while sending QUIC packets.
    fn write_errors(&self, reason: labels::QuicWriteError) -> Counter;

    /// Record timing information from sendmsg calls that return
    /// WouldBlock and are retried in a loop.
    fn send_to_wouldblock_duration_s(&self) -> TimeHistogram;

    /// Number of QUIC packets received where the CID could not be verified.
    fn invalid_cid_packet_count(&self, reason: crate::BoxError) -> Counter;

    /// Number of accepted QUIC Initial packets.
    fn accepted_initial_packet_count(&self) -> Counter;

    /// Number of accepted QUIC Initial packets using expensive label(s).
    ///
    /// Unlike [`Metrics::accepted_initial_packet_count`], this variant also
    /// receives the peer's IP address.
    fn expensive_accepted_initial_packet_count(&self, peer_ip: IpAddr)
        -> Counter;

    /// Number of QUIC packets received but not associated with an active
    /// connection.
    fn rejected_initial_packet_count(
        &self, reason: labels::QuicInvalidInitialPacketError,
    ) -> Counter;

    /// Number of QUIC packets received but not associated with an active
    /// connection using expensive label(s).
    ///
    /// Unlike [`Metrics::rejected_initial_packet_count`], this variant also
    /// receives the peer's IP address.
    fn expensive_rejected_initial_packet_count(
        &self, reason: labels::QuicInvalidInitialPacketError, peer_ip: IpAddr,
    ) -> Counter;

    /// Combined utilized bandwidth of all open connections (max over the past
    /// two minutes).
    fn utilized_bandwidth(&self) -> Gauge;

    /// The highest utilized bandwidth reported during the lifetime of the
    /// connection.
    fn max_bandwidth_mbps(&self) -> Histogram;

    /// The highest momentary loss reported during the lifetime of the
    /// connection.
    fn max_loss_pct(&self) -> Histogram;

    /// Number of UDP packets dropped when receiving.
    fn udp_drop_count(&self) -> Counter;

    /// Number of failed QUIC handshakes.
    fn failed_handshakes(&self, reason: labels::HandshakeError) -> Counter;

    /// Number of HTTP/3 connection closures generated locally.
    fn local_h3_conn_close_error_count(&self, reason: labels::H3Error)
        -> Counter;

    /// Number of QUIC connection closures generated locally.
    fn local_quic_conn_close_error_count(
        &self, reason: labels::QuicError,
    ) -> Counter;

    /// Number of HTTP/3 connection closures generated by peer.
    fn peer_h3_conn_close_error_count(&self, reason: labels::H3Error) -> Counter;

    /// Number of QUIC connection closures generated by peer.
    fn peer_quic_conn_close_error_count(
        &self, reason: labels::QuicError,
    ) -> Counter;

    // ==== tokio runtime metrics ====

    /// Histogram of task schedule delays.
    fn tokio_runtime_task_schedule_delay_histogram(
        &self, task: &Arc<str>,
    ) -> TimeHistogram;

    /// Histogram of task poll durations.
    fn tokio_runtime_task_poll_duration_histogram(
        &self, task: &Arc<str>,
    ) -> TimeHistogram;

    /// Total poll time counter; helps us get a rough idea of whether our
    /// waker is causing issues.
    fn tokio_runtime_task_total_poll_time_micros(
        &self, task: &Arc<str>,
    ) -> Counter;
}
135
/// Standard implementation of [`Metrics`] using
/// [`foundations::telemetry::metrics`].
///
/// This is a field-less unit struct, so constructing, cloning and
/// debug-printing it carry no state.
#[derive(Debug, Default, Clone)]
pub struct DefaultMetrics;
140
/// Forwards every [`Metrics`] method to the function of the same purpose
/// declared in this file's `quic` and `tokio` metric modules.
impl Metrics for DefaultMetrics {
    fn connections_in_memory(&self) -> Gauge {
        quic::connections_in_memory()
    }

    fn maximum_writable_streams(&self) -> Histogram {
        quic::maximum_writable_streams()
    }

    fn handshake_time_seconds(
        &self, stage: labels::QuicHandshakeStage,
    ) -> TimeHistogram {
        quic::handshake_time_seconds(stage)
    }

    fn write_errors(&self, reason: labels::QuicWriteError) -> Counter {
        quic::write_errors(reason)
    }

    fn send_to_wouldblock_duration_s(&self) -> TimeHistogram {
        quic::send_to_wouldblock_duration_s()
    }

    fn invalid_cid_packet_count(&self, reason: crate::BoxError) -> Counter {
        // `quic::invalid_cid_packet_count` takes a `String`, so the boxed
        // error is rendered with `to_string()` before being passed on.
        // NOTE(review): if that string ends up as a label value, every
        // distinct error message yields a distinct series — confirm the set
        // of messages is bounded.
        quic::invalid_cid_packet_count(reason.to_string())
    }

    fn accepted_initial_packet_count(&self) -> Counter {
        quic::accepted_initial_packet_count()
    }

    fn expensive_accepted_initial_packet_count(
        &self, peer_ip: IpAddr,
    ) -> Counter {
        quic::expensive_accepted_initial_packet_count(peer_ip)
    }

    fn rejected_initial_packet_count(
        &self, reason: labels::QuicInvalidInitialPacketError,
    ) -> Counter {
        quic::rejected_initial_packet_count(reason)
    }

    fn expensive_rejected_initial_packet_count(
        &self, reason: labels::QuicInvalidInitialPacketError, peer_ip: IpAddr,
    ) -> Counter {
        quic::expensive_rejected_initial_packet_count(reason, peer_ip)
    }

    fn utilized_bandwidth(&self) -> Gauge {
        quic::utilized_bandwidth()
    }

    fn max_bandwidth_mbps(&self) -> Histogram {
        quic::max_bandwidth_mbps()
    }

    fn max_loss_pct(&self) -> Histogram {
        quic::max_loss_pct()
    }

    fn udp_drop_count(&self) -> Counter {
        quic::udp_drop_count()
    }

    fn failed_handshakes(&self, reason: labels::HandshakeError) -> Counter {
        quic::failed_handshakes(reason)
    }

    fn local_h3_conn_close_error_count(
        &self, reason: labels::H3Error,
    ) -> Counter {
        quic::local_h3_conn_close_error_count(reason)
    }

    fn local_quic_conn_close_error_count(
        &self, reason: labels::QuicError,
    ) -> Counter {
        quic::local_quic_conn_close_error_count(reason)
    }

    fn peer_h3_conn_close_error_count(&self, reason: labels::H3Error) -> Counter {
        quic::peer_h3_conn_close_error_count(reason)
    }

    fn peer_quic_conn_close_error_count(
        &self, reason: labels::QuicError,
    ) -> Counter {
        quic::peer_quic_conn_close_error_count(reason)
    }

    // ==== tokio runtime metrics ====
    // `tokio::` below resolves to the `mod tokio` declared in this file.

    /// Histogram of task schedule delays
    fn tokio_runtime_task_schedule_delay_histogram(
        &self, task: &Arc<str>,
    ) -> TimeHistogram {
        tokio::runtime_task_schedule_delay_histogram(task)
    }

    /// Histogram of task poll durations
    fn tokio_runtime_task_poll_duration_histogram(
        &self, task: &Arc<str>,
    ) -> TimeHistogram {
        tokio::runtime_task_poll_duration_histogram(task)
    }

    /// Helps us get a rough idea of if our waker is causing issues.
    fn tokio_runtime_task_total_poll_time_micros(
        &self, task: &Arc<str>,
    ) -> Counter {
        tokio::runtime_task_total_poll_time_micros(task)
    }
}
255
256#[metrics]
257pub(crate) mod quic {
258    /// Number of QUIC connections currently in memory
259    pub fn connections_in_memory() -> Gauge;
260
261    /// Maximum number of writable QUIC streams in a connection
262    #[optional]
263    #[ctor = HistogramBuilder { buckets: &[0.0, 5.0, 10.0, 100.0, 1000.0, 2000.0, 3000.0, 10000.0, 20000.0, 50000.0], }]
264    pub fn maximum_writable_streams() -> Histogram;
265
266    /// Overhead of QUIC handshake processing stage
267    #[ctor = HistogramBuilder { buckets: &[1E-5, 2E-5, 5E-5, 1E-4, 2E-4, 5E-4, 1E-3, 2E-3, 5E-3, 1E-2, 2E-2, 5E-2, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0], }]
268    pub fn handshake_time_seconds(
269        stage: labels::QuicHandshakeStage,
270    ) -> TimeHistogram;
271
272    /// Number of error and partial writes while sending QUIC packets
273    pub fn write_errors(reason: labels::QuicWriteError) -> Counter;
274
275    /// Record timing information from sendmsg calls that return
276    /// WouldBlock and are retried in a loop.
277    #[ctor = HistogramBuilder { buckets: &[1E-6, 1E-5, 1E-4, 1E-3, 5E-3, 1E-2, 2E-2, 4E-2, 8E-2, 16E-2, 1.0], }]
278    pub fn send_to_wouldblock_duration_s() -> TimeHistogram;
279
280    /// Number of QUIC packets received where the CID could not be verified.
281    pub fn invalid_cid_packet_count(reason: String) -> Counter;
282
283    /// Number of accepted QUIC Initial packets
284    pub fn accepted_initial_packet_count() -> Counter;
285
286    /// Number of accepted QUIC Initial packets using expensive label(s)
287    #[optional]
288    pub fn expensive_accepted_initial_packet_count(peer_ip: IpAddr) -> Counter;
289
290    /// Number of QUIC packets received but not associated with an active
291    /// connection
292    pub fn rejected_initial_packet_count(
293        reason: labels::QuicInvalidInitialPacketError,
294    ) -> Counter;
295
296    /// Number of QUIC packets received but not associated with an active
297    /// connection using expensive label(s)
298    #[optional]
299    pub fn expensive_rejected_initial_packet_count(
300        reason: labels::QuicInvalidInitialPacketError, peer_ip: IpAddr,
301    ) -> Counter;
302
303    /// Combined utilized bandwidth of all open connections (max over the past
304    /// two minutes)
305    pub fn utilized_bandwidth() -> Gauge;
306
307    /// The highest utilized bandwidh reported during the lifetime of the
308    /// connection
309    #[ctor = HistogramBuilder { buckets: &[0., 1., 2., 5., 10., 20., 50., 100., 200., 300., 500., 750., 1000., 1500., 2000., 2500., 3000., 3500., 4000., 4500., 5000., 6000., 7000., 10000.], }]
310    pub fn max_bandwidth_mbps() -> Histogram;
311
312    /// The highest momentary loss reported during the lifetime of the
313    /// connection
314    #[ctor = HistogramBuilder { buckets: &[0.0, 0.1, 0.2, 0.5, 1., 2., 3., 4., 5., 10., 15., 20., 25., 50., 100.], }]
315    pub fn max_loss_pct() -> Histogram;
316
317    /// Number of UDP packets dropped when receiving
318    pub fn udp_drop_count() -> Counter;
319
320    /// Number of failed quic handshakes
321    pub fn failed_handshakes(reason: labels::HandshakeError) -> Counter;
322
323    /// Number of HTTP/3 connection closures generated locally
324    pub fn local_h3_conn_close_error_count(reason: labels::H3Error) -> Counter;
325
326    /// Number of QUIC connection closures generated locally
327    pub fn local_quic_conn_close_error_count(
328        reason: labels::QuicError,
329    ) -> Counter;
330
331    /// Number of HTTP/3 connection closures generated by peer
332    pub fn peer_h3_conn_close_error_count(reason: labels::H3Error) -> Counter;
333
334    /// Number of QUIC connection closures generated by peer
335    pub fn peer_quic_conn_close_error_count(reason: labels::QuicError)
336        -> Counter;
337}
338
// Per-task runtime metric declarations; each function takes the `task` the
// measurement belongs to. The bodiless `fn` items are input to the
// `#[metrics]` attribute, which supplies their implementations.
// Within this file, `tokio::<name>(..)` paths refer to this module.
// NOTE(review): the `///` text on each item is presumably exported as the
// metric's description by `#[metrics]` — confirm before rewording it.
#[metrics]
mod tokio {
    /// Histogram of task schedule delays
    #[ctor = HistogramBuilder { buckets: &[0.0, 1E-4, 2E-4, 3E-4, 4E-4, 5E-4, 6E-4, 7E-4, 8E-4, 9E-4, 1E-3, 1E-2, 2E-2, 4E-2, 8E-2, 1E-1, 1.0], }]
    pub fn runtime_task_schedule_delay_histogram(
        task: &Arc<str>,
    ) -> TimeHistogram;

    /// Histogram of task poll durations
    #[ctor = HistogramBuilder { buckets: &[0.0, 1E-4, 2E-4, 3E-4, 4E-4, 5E-4, 6E-4, 7E-4, 8E-4, 9E-4, 1E-3, 1E-2, 2E-2, 4E-2, 8E-2, 1E-1, 1.0], }]
    pub fn runtime_task_poll_duration_histogram(task: &Arc<str>)
        -> TimeHistogram;

    /// Helps us get a rough idea of if our waker is causing issues.
    pub fn runtime_task_total_poll_time_micros(task: &Arc<str>) -> Counter;
}
355
356pub(crate) fn quic_expensive_metrics_ip_reduce(ip: IpAddr) -> Option<IpAddr> {
357    const QUIC_INITIAL_METRICS_V4_PREFIX: u8 = 20;
358    const QUIC_INITIAL_METRICS_V6_PREFIX: u8 = 32;
359
360    let prefix = if ip.is_ipv4() {
361        QUIC_INITIAL_METRICS_V4_PREFIX
362    } else {
363        QUIC_INITIAL_METRICS_V6_PREFIX
364    };
365
366    if let Ok(ip_net) = ipnetwork::IpNetwork::new(ip, prefix) {
367        Some(ip_net.network())
368    } else {
369        None
370    }
371}