// tokio_quiche/metrics/mod.rs

// Copyright (C) 2025, Cloudflare, Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright notice,
//       this list of conditions and the following disclaimer.
//
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

//! Metrics collected across QUIC connections.
28
29pub mod labels;
30pub mod tokio_task;
31
32use foundations::telemetry::metrics::metrics;
33use foundations::telemetry::metrics::Counter;
34use foundations::telemetry::metrics::Gauge;
35use foundations::telemetry::metrics::Histogram;
36use foundations::telemetry::metrics::HistogramBuilder;
37use foundations::telemetry::metrics::TimeHistogram;
38use std::net::IpAddr;
39use std::sync::Arc;
40
41/// Trait to direct the metrics emitted by the crate to a Prometheus registry.
42pub trait Metrics: Send + Sync + Clone + Unpin + 'static {
43    /// Number of QUIC connections currently in memory
44    fn connections_in_memory(&self) -> Gauge;
45
46    /// Maximum number of writable QUIC streams in a connection
47    fn maximum_writable_streams(&self) -> Histogram;
48
49    /// Overhead of QUIC handshake processing stage
50    fn handshake_time_seconds(
51        &self, stage: labels::QuicHandshakeStage,
52    ) -> TimeHistogram;
53
54    /// Number of error and partial writes while sending QUIC packets
55    fn write_errors(&self, reason: labels::QuicWriteError) -> Counter;
56
57    /// Number of QUIC packets received where the CID could not be verified.
58    fn invalid_cid_packet_count(&self, reason: crate::BoxError) -> Counter;
59
60    /// Number of accepted QUIC Initial packets
61    fn accepted_initial_packet_count(&self) -> Counter;
62
63    /// Number of accepted QUIC Initial packets using expensive label(s)
64    fn expensive_accepted_initial_packet_count(&self, peer_ip: IpAddr)
65        -> Counter;
66
67    /// Number of QUIC packets received but not associated with an active
68    /// connection
69    fn rejected_initial_packet_count(
70        &self, reason: labels::QuicInvalidInitialPacketError,
71    ) -> Counter;
72
73    /// Number of QUIC packets received but not associated with an active
74    /// connection using expensive label(s)
75    fn expensive_rejected_initial_packet_count(
76        &self, reason: labels::QuicInvalidInitialPacketError, peer_ip: IpAddr,
77    ) -> Counter;
78
79    /// Combined utilized bandwidth of all open connections (max over the past
80    /// two minutes)
81    fn utilized_bandwidth(&self) -> Gauge;
82
83    /// The highest utilized bandwidh reported during the lifetime of the
84    /// connection
85    fn max_bandwidth_mbps(&self) -> Histogram;
86
87    /// The highest momentary loss reported during the lifetime of the
88    /// connection
89    fn max_loss_pct(&self) -> Histogram;
90
91    /// Number of UDP packets dropped when receiving
92    fn udp_drop_count(&self) -> Counter;
93
94    /// Number of failed quic handshakes
95    fn failed_handshakes(&self, reason: labels::HandshakeError) -> Counter;
96
97    /// Number of HTTP/3 connection closures generated locally
98    fn local_h3_conn_close_error_count(&self, reason: labels::H3Error)
99        -> Counter;
100
101    /// Number of QUIC connection closures generated locally
102    fn local_quic_conn_close_error_count(
103        &self, reason: labels::QuicError,
104    ) -> Counter;
105
106    /// Number of HTTP/3 connection closures generated by peer
107    fn peer_h3_conn_close_error_count(&self, reason: labels::H3Error) -> Counter;
108
109    /// Number of QUIC connection closures generated by peer
110    fn peer_quic_conn_close_error_count(
111        &self, reason: labels::QuicError,
112    ) -> Counter;
113
114    // ==== tokio runtime metrics ====
115
116    /// Histogram of task schedule delays
117    fn tokio_runtime_task_schedule_delay_histogram(
118        &self, task: &Arc<str>,
119    ) -> TimeHistogram;
120
121    /// Histogram of task poll durations
122    fn tokio_runtime_task_poll_duration_histogram(
123        &self, task: &Arc<str>,
124    ) -> TimeHistogram;
125
126    /// Helps us get a rough idea of if our waker is causing issues.
127    fn tokio_runtime_task_total_poll_time_micros(
128        &self, task: &Arc<str>,
129    ) -> Counter;
130}
131
/// Standard implementation of [`Metrics`] using
/// [`foundations::telemetry::metrics`].
///
/// This is a stateless unit struct: it carries no data, so constructing or
/// cloning it costs nothing. Its [`Metrics`] methods forward to the
/// `#[metrics]` modules declared in this file.
#[derive(Debug, Default, Clone)]
pub struct DefaultMetrics;
136
137impl Metrics for DefaultMetrics {
138    fn connections_in_memory(&self) -> Gauge {
139        quic::connections_in_memory()
140    }
141
142    fn maximum_writable_streams(&self) -> Histogram {
143        quic::maximum_writable_streams()
144    }
145
146    fn handshake_time_seconds(
147        &self, stage: labels::QuicHandshakeStage,
148    ) -> TimeHistogram {
149        quic::handshake_time_seconds(stage)
150    }
151
152    fn write_errors(&self, reason: labels::QuicWriteError) -> Counter {
153        quic::write_errors(reason)
154    }
155
156    fn invalid_cid_packet_count(&self, reason: crate::BoxError) -> Counter {
157        quic::invalid_cid_packet_count(reason.to_string())
158    }
159
160    fn accepted_initial_packet_count(&self) -> Counter {
161        quic::accepted_initial_packet_count()
162    }
163
164    fn expensive_accepted_initial_packet_count(
165        &self, peer_ip: IpAddr,
166    ) -> Counter {
167        quic::expensive_accepted_initial_packet_count(peer_ip)
168    }
169
170    fn rejected_initial_packet_count(
171        &self, reason: labels::QuicInvalidInitialPacketError,
172    ) -> Counter {
173        quic::rejected_initial_packet_count(reason)
174    }
175
176    fn expensive_rejected_initial_packet_count(
177        &self, reason: labels::QuicInvalidInitialPacketError, peer_ip: IpAddr,
178    ) -> Counter {
179        quic::expensive_rejected_initial_packet_count(reason, peer_ip)
180    }
181
182    fn utilized_bandwidth(&self) -> Gauge {
183        quic::utilized_bandwidth()
184    }
185
186    fn max_bandwidth_mbps(&self) -> Histogram {
187        quic::max_bandwidth_mbps()
188    }
189
190    fn max_loss_pct(&self) -> Histogram {
191        quic::max_loss_pct()
192    }
193
194    fn udp_drop_count(&self) -> Counter {
195        quic::udp_drop_count()
196    }
197
198    fn failed_handshakes(&self, reason: labels::HandshakeError) -> Counter {
199        quic::failed_handshakes(reason)
200    }
201
202    fn local_h3_conn_close_error_count(
203        &self, reason: labels::H3Error,
204    ) -> Counter {
205        quic::local_h3_conn_close_error_count(reason)
206    }
207
208    fn local_quic_conn_close_error_count(
209        &self, reason: labels::QuicError,
210    ) -> Counter {
211        quic::local_quic_conn_close_error_count(reason)
212    }
213
214    fn peer_h3_conn_close_error_count(&self, reason: labels::H3Error) -> Counter {
215        quic::peer_h3_conn_close_error_count(reason)
216    }
217
218    fn peer_quic_conn_close_error_count(
219        &self, reason: labels::QuicError,
220    ) -> Counter {
221        quic::peer_quic_conn_close_error_count(reason)
222    }
223
224    // ==== tokio runtime metrics ====
225
226    /// Histogram of task schedule delays
227    fn tokio_runtime_task_schedule_delay_histogram(
228        &self, task: &Arc<str>,
229    ) -> TimeHistogram {
230        tokio::runtime_task_schedule_delay_histogram(task)
231    }
232
233    /// Histogram of task poll durations
234    fn tokio_runtime_task_poll_duration_histogram(
235        &self, task: &Arc<str>,
236    ) -> TimeHistogram {
237        tokio::runtime_task_poll_duration_histogram(task)
238    }
239
240    /// Helps us get a rough idea of if our waker is causing issues.
241    fn tokio_runtime_task_total_poll_time_micros(
242        &self, task: &Arc<str>,
243    ) -> Counter {
244        tokio::runtime_task_total_poll_time_micros(task)
245    }
246}
247
248#[metrics]
249pub(crate) mod quic {
250    /// Number of QUIC connections currently in memory
251    pub fn connections_in_memory() -> Gauge;
252
253    /// Maximum number of writable QUIC streams in a connection
254    #[optional]
255    #[ctor = HistogramBuilder { buckets: &[0.0, 5.0, 10.0, 100.0, 1000.0, 2000.0, 3000.0, 10000.0, 20000.0, 50000.0], }]
256    pub fn maximum_writable_streams() -> Histogram;
257
258    /// Overhead of QUIC handshake processing stage
259    #[ctor = HistogramBuilder { buckets: &[1E-5, 2E-5, 5E-5, 1E-4, 2E-4, 5E-4, 1E-3, 2E-3, 5E-3, 1E-2, 2E-2, 5E-2, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0], }]
260    pub fn handshake_time_seconds(
261        stage: labels::QuicHandshakeStage,
262    ) -> TimeHistogram;
263
264    /// Number of error and partial writes while sending QUIC packets
265    pub fn write_errors(reason: labels::QuicWriteError) -> Counter;
266
267    /// Number of QUIC packets received where the CID could not be verified.
268    pub fn invalid_cid_packet_count(reason: String) -> Counter;
269
270    /// Number of accepted QUIC Initial packets
271    pub fn accepted_initial_packet_count() -> Counter;
272
273    /// Number of accepted QUIC Initial packets using expensive label(s)
274    #[optional]
275    pub fn expensive_accepted_initial_packet_count(peer_ip: IpAddr) -> Counter;
276
277    /// Number of QUIC packets received but not associated with an active
278    /// connection
279    pub fn rejected_initial_packet_count(
280        reason: labels::QuicInvalidInitialPacketError,
281    ) -> Counter;
282
283    /// Number of QUIC packets received but not associated with an active
284    /// connection using expensive label(s)
285    #[optional]
286    pub fn expensive_rejected_initial_packet_count(
287        reason: labels::QuicInvalidInitialPacketError, peer_ip: IpAddr,
288    ) -> Counter;
289
290    /// Combined utilized bandwidth of all open connections (max over the past
291    /// two minutes)
292    pub fn utilized_bandwidth() -> Gauge;
293
294    /// The highest utilized bandwidh reported during the lifetime of the
295    /// connection
296    #[ctor = HistogramBuilder { buckets: &[0., 1., 2., 5., 10., 20., 50., 100., 200., 300., 500., 750., 1000., 1500., 2000., 2500., 3000., 3500., 4000., 4500., 5000., 6000., 7000., 10000.], }]
297    pub fn max_bandwidth_mbps() -> Histogram;
298
299    /// The highest momentary loss reported during the lifetime of the
300    /// connection
301    #[ctor = HistogramBuilder { buckets: &[0.0, 0.1, 0.2, 0.5, 1., 2., 3., 4., 5., 10., 15., 20., 25., 50., 100.], }]
302    pub fn max_loss_pct() -> Histogram;
303
304    /// Number of UDP packets dropped when receiving
305    pub fn udp_drop_count() -> Counter;
306
307    /// Number of failed quic handshakes
308    pub fn failed_handshakes(reason: labels::HandshakeError) -> Counter;
309
310    /// Number of HTTP/3 connection closures generated locally
311    pub fn local_h3_conn_close_error_count(reason: labels::H3Error) -> Counter;
312
313    /// Number of QUIC connection closures generated locally
314    pub fn local_quic_conn_close_error_count(
315        reason: labels::QuicError,
316    ) -> Counter;
317
318    /// Number of HTTP/3 connection closures generated by peer
319    pub fn peer_h3_conn_close_error_count(reason: labels::H3Error) -> Counter;
320
321    /// Number of QUIC connection closures generated by peer
322    pub fn peer_quic_conn_close_error_count(reason: labels::QuicError)
323        -> Counter;
324}
325
326#[metrics]
327mod tokio {
328    /// Histogram of task schedule delays
329    #[ctor = HistogramBuilder { buckets: &[0.0, 1E-4, 2E-4, 3E-4, 4E-4, 5E-4, 6E-4, 7E-4, 8E-4, 9E-4, 1E-3, 1E-2, 2E-2, 4E-2, 8E-2, 1E-1, 1.0], }]
330    pub fn runtime_task_schedule_delay_histogram(
331        task: &Arc<str>,
332    ) -> TimeHistogram;
333
334    /// Histogram of task poll durations
335    #[ctor = HistogramBuilder { buckets: &[0.0, 1E-4, 2E-4, 3E-4, 4E-4, 5E-4, 6E-4, 7E-4, 8E-4, 9E-4, 1E-3, 1E-2, 2E-2, 4E-2, 8E-2, 1E-1, 1.0], }]
336    pub fn runtime_task_poll_duration_histogram(task: &Arc<str>)
337        -> TimeHistogram;
338
339    /// Helps us get a rough idea of if our waker is causing issues.
340    pub fn runtime_task_total_poll_time_micros(task: &Arc<str>) -> Counter;
341}
342
343pub(crate) fn quic_expensive_metrics_ip_reduce(ip: IpAddr) -> Option<IpAddr> {
344    const QUIC_INITIAL_METRICS_V4_PREFIX: u8 = 20;
345    const QUIC_INITIAL_METRICS_V6_PREFIX: u8 = 32;
346
347    let prefix = if ip.is_ipv4() {
348        QUIC_INITIAL_METRICS_V4_PREFIX
349    } else {
350        QUIC_INITIAL_METRICS_V6_PREFIX
351    };
352
353    if let Ok(ip_net) = ipnetwork::IpNetwork::new(ip, prefix) {
354        Some(ip_net.network())
355    } else {
356        None
357    }
358}