Skip to main content

tokio_quiche/metrics/
mod.rs

1// Copyright (C) 2025, Cloudflare, Inc.
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are
6// met:
7//
8//     * Redistributions of source code must retain the above copyright notice,
9//       this list of conditions and the following disclaimer.
10//
11//     * Redistributions in binary form must reproduce the above copyright
12//       notice, this list of conditions and the following disclaimer in the
13//       documentation and/or other materials provided with the distribution.
14//
15// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
16// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
17// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
19// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27//! Metrics collected across QUIC connections.
28
29pub mod labels;
30pub mod tokio_task;
31
32use foundations::telemetry::metrics::metrics;
33use foundations::telemetry::metrics::Counter;
34use foundations::telemetry::metrics::Gauge;
35use foundations::telemetry::metrics::Histogram;
36use foundations::telemetry::metrics::HistogramBuilder;
37use foundations::telemetry::metrics::TimeHistogram;
38use std::net::IpAddr;
39use std::sync::Arc;
40
/// Trait to direct the metrics emitted by the crate to a Prometheus registry.
///
/// Each method returns a handle ([`Gauge`], [`Counter`], [`Histogram`] or
/// [`TimeHistogram`]) for one metric. [`DefaultMetrics`] is the implementation
/// provided by this module.
pub trait Metrics: Send + Sync + Clone + Unpin + 'static {
    /// Number of QUIC connections currently in memory
    fn connections_in_memory(&self) -> Gauge;

    /// Maximum number of writable QUIC streams in a connection
    fn maximum_writable_streams(&self) -> Histogram;

    /// Overhead of QUIC handshake processing stage
    ///
    /// `stage` selects which handshake stage the timing belongs to.
    fn handshake_time_seconds(
        &self, stage: labels::QuicHandshakeStage,
    ) -> TimeHistogram;

    /// Number of error and partial writes while sending QUIC packets
    fn write_errors(&self, reason: labels::QuicWriteError) -> Counter;

    /// Record timing information from sendmsg calls that return
    /// WouldBlock and are retried in a loop.
    fn send_to_wouldblock_duration_s(&self) -> TimeHistogram;

    /// Number of mid-handshake flush operations that were skipped due to future
    /// cancellation.
    fn skipped_mid_handshake_flush_count(&self) -> Counter;

    /// Number of QUIC packets received where the CID could not be verified.
    ///
    /// `reason` is passed by value as a boxed error; [`DefaultMetrics`]
    /// converts it with `to_string()` before handing it to the metric.
    // NOTE(review): a free-form error string as `reason` presumably yields one
    // series per distinct message — confirm the set of messages is bounded.
    fn invalid_cid_packet_count(&self, reason: crate::BoxError) -> Counter;

    /// Number of accepted QUIC Initial packets
    fn accepted_initial_packet_count(&self) -> Counter;

    /// Number of accepted QUIC Initial packets using expensive label(s)
    // NOTE(review): `quic_expensive_metrics_ip_reduce` in this module looks
    // intended to coarsen `peer_ip` before it reaches this method — confirm
    // against the callers.
    fn expensive_accepted_initial_packet_count(&self, peer_ip: IpAddr)
        -> Counter;

    /// Number of QUIC packets received but not associated with an active
    /// connection
    fn rejected_initial_packet_count(
        &self, reason: labels::QuicInvalidInitialPacketError,
    ) -> Counter;

    /// Number of QUIC packets received but not associated with an active
    /// connection using expensive label(s)
    fn expensive_rejected_initial_packet_count(
        &self, reason: labels::QuicInvalidInitialPacketError, peer_ip: IpAddr,
    ) -> Counter;

    /// Combined utilized bandwidth of all open connections (max over the past
    /// two minutes)
    fn utilized_bandwidth(&self) -> Gauge;

    /// The highest utilized bandwidth reported during the lifetime of the
    /// connection
    fn max_bandwidth_mbps(&self) -> Histogram;

    /// The highest momentary loss reported during the lifetime of the
    /// connection
    fn max_loss_pct(&self) -> Histogram;

    /// Number of UDP packets dropped when receiving
    fn udp_drop_count(&self) -> Counter;

    /// Number of failed QUIC handshakes
    fn failed_handshakes(&self, reason: labels::HandshakeError) -> Counter;

    /// Number of HTTP/3 connection closures generated locally
    fn local_h3_conn_close_error_count(&self, reason: labels::H3Error)
        -> Counter;

    /// Number of QUIC connection closures generated locally
    fn local_quic_conn_close_error_count(
        &self, reason: labels::QuicError,
    ) -> Counter;

    /// Number of HTTP/3 connection closures generated by peer
    fn peer_h3_conn_close_error_count(&self, reason: labels::H3Error) -> Counter;

    /// Number of QUIC connection closures generated by peer
    fn peer_quic_conn_close_error_count(
        &self, reason: labels::QuicError,
    ) -> Counter;

    // ==== tokio runtime metrics ====
    //
    // Each of the methods below takes a `task: &Arc<str>` argument.
    // NOTE(review): `task` is presumably the name of the spawned task the
    // measurement belongs to — confirm against `tokio_task`.

    /// Histogram of task schedule delays
    fn tokio_runtime_task_schedule_delay_histogram(
        &self, task: &Arc<str>,
    ) -> TimeHistogram;

    /// Histogram of task poll durations
    fn tokio_runtime_task_poll_duration_histogram(
        &self, task: &Arc<str>,
    ) -> TimeHistogram;

    /// Helps us get a rough idea of if our waker is causing issues.
    fn tokio_runtime_task_total_poll_time_micros(
        &self, task: &Arc<str>,
    ) -> Counter;
}
139
/// Standard implementation of [`Metrics`] using
/// [`foundations::telemetry::metrics`].
///
/// This is a unit struct that holds no state of its own, so constructing it
/// (via [`Default`]) and cloning it are free. It also implements
/// [`Debug`](std::fmt::Debug) so that types embedding it can derive `Debug`.
#[derive(Debug, Default, Clone)]
pub struct DefaultMetrics;
144
/// Forwards each [`Metrics`] method to the corresponding function declared in
/// the `quic` and `tokio` metric modules of this file.
///
/// Arguments are passed through unchanged, with one exception:
/// `invalid_cid_packet_count` converts its boxed error to a `String` first.
impl Metrics for DefaultMetrics {
    fn connections_in_memory(&self) -> Gauge {
        quic::connections_in_memory()
    }

    fn maximum_writable_streams(&self) -> Histogram {
        quic::maximum_writable_streams()
    }

    fn handshake_time_seconds(
        &self, stage: labels::QuicHandshakeStage,
    ) -> TimeHistogram {
        quic::handshake_time_seconds(stage)
    }

    fn write_errors(&self, reason: labels::QuicWriteError) -> Counter {
        quic::write_errors(reason)
    }

    fn send_to_wouldblock_duration_s(&self) -> TimeHistogram {
        quic::send_to_wouldblock_duration_s()
    }

    fn skipped_mid_handshake_flush_count(&self) -> Counter {
        quic::skipped_mid_handshake_flush_count()
    }

    fn invalid_cid_packet_count(&self, reason: crate::BoxError) -> Counter {
        // The underlying metric function takes a `String`, so the error is
        // rendered through its `Display` impl before being passed on.
        quic::invalid_cid_packet_count(reason.to_string())
    }

    fn accepted_initial_packet_count(&self) -> Counter {
        quic::accepted_initial_packet_count()
    }

    fn expensive_accepted_initial_packet_count(
        &self, peer_ip: IpAddr,
    ) -> Counter {
        // `peer_ip` is forwarded exactly as given; no prefix reduction is
        // applied here.
        quic::expensive_accepted_initial_packet_count(peer_ip)
    }

    fn rejected_initial_packet_count(
        &self, reason: labels::QuicInvalidInitialPacketError,
    ) -> Counter {
        quic::rejected_initial_packet_count(reason)
    }

    fn expensive_rejected_initial_packet_count(
        &self, reason: labels::QuicInvalidInitialPacketError, peer_ip: IpAddr,
    ) -> Counter {
        // `peer_ip` is forwarded exactly as given; no prefix reduction is
        // applied here.
        quic::expensive_rejected_initial_packet_count(reason, peer_ip)
    }

    fn utilized_bandwidth(&self) -> Gauge {
        quic::utilized_bandwidth()
    }

    fn max_bandwidth_mbps(&self) -> Histogram {
        quic::max_bandwidth_mbps()
    }

    fn max_loss_pct(&self) -> Histogram {
        quic::max_loss_pct()
    }

    fn udp_drop_count(&self) -> Counter {
        quic::udp_drop_count()
    }

    fn failed_handshakes(&self, reason: labels::HandshakeError) -> Counter {
        quic::failed_handshakes(reason)
    }

    fn local_h3_conn_close_error_count(
        &self, reason: labels::H3Error,
    ) -> Counter {
        quic::local_h3_conn_close_error_count(reason)
    }

    fn local_quic_conn_close_error_count(
        &self, reason: labels::QuicError,
    ) -> Counter {
        quic::local_quic_conn_close_error_count(reason)
    }

    fn peer_h3_conn_close_error_count(&self, reason: labels::H3Error) -> Counter {
        quic::peer_h3_conn_close_error_count(reason)
    }

    fn peer_quic_conn_close_error_count(
        &self, reason: labels::QuicError,
    ) -> Counter {
        quic::peer_quic_conn_close_error_count(reason)
    }

    // ==== tokio runtime metrics ====
    //
    // The `tokio::` paths below name the `mod tokio` declared in this file
    // (its functions drop the `tokio_` prefix used by the trait methods).

    /// Histogram of task schedule delays
    fn tokio_runtime_task_schedule_delay_histogram(
        &self, task: &Arc<str>,
    ) -> TimeHistogram {
        tokio::runtime_task_schedule_delay_histogram(task)
    }

    /// Histogram of task poll durations
    fn tokio_runtime_task_poll_duration_histogram(
        &self, task: &Arc<str>,
    ) -> TimeHistogram {
        tokio::runtime_task_poll_duration_histogram(task)
    }

    /// Helps us get a rough idea of if our waker is causing issues.
    fn tokio_runtime_task_total_poll_time_micros(
        &self, task: &Arc<str>,
    ) -> Counter {
        tokio::runtime_task_total_poll_time_micros(task)
    }
}
263
// QUIC and HTTP/3 metric declarations used by `DefaultMetrics`.
//
// NOTE(review): the bodiless `fn` items are input to foundations' `#[metrics]`
// macro, which presumably generates their implementations, treats each
// argument as a label, and exports the `///` text as the metric's help
// string. If so, the doc comments below are runtime-visible — confirm before
// rewording any of them.
//
// NOTE(review): `#[optional]` presumably marks metrics that are only collected
// when optional metrics are enabled in foundations — confirm.
// `#[ctor = HistogramBuilder { .. }]` supplies the histogram bucket boundaries.
#[metrics]
pub(crate) mod quic {
    /// Number of QUIC connections currently in memory
    pub fn connections_in_memory() -> Gauge;

    /// Maximum number of writable QUIC streams in a connection
    #[optional]
    #[ctor = HistogramBuilder { buckets: &[0.0, 5.0, 10.0, 100.0, 1000.0, 2000.0, 3000.0, 10000.0, 20000.0, 50000.0], }]
    pub fn maximum_writable_streams() -> Histogram;

    /// Overhead of QUIC handshake processing stage
    // Buckets run from 1E-5 to 5.0 (presumably seconds, per the metric name).
    #[ctor = HistogramBuilder { buckets: &[1E-5, 2E-5, 5E-5, 1E-4, 2E-4, 5E-4, 1E-3, 2E-3, 5E-3, 1E-2, 2E-2, 5E-2, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0], }]
    pub fn handshake_time_seconds(
        stage: labels::QuicHandshakeStage,
    ) -> TimeHistogram;

    /// Number of error and partial writes while sending QUIC packets
    pub fn write_errors(reason: labels::QuicWriteError) -> Counter;

    /// Record timing information from sendmsg calls that return
    /// WouldBlock and are retried in a loop.
    // Buckets run from 1E-6 to 1.0 (presumably seconds, per the `_s` suffix).
    #[ctor = HistogramBuilder { buckets: &[1E-6, 1E-5, 1E-4, 1E-3, 5E-3, 1E-2, 2E-2, 4E-2, 8E-2, 16E-2, 1.0], }]
    pub fn send_to_wouldblock_duration_s() -> TimeHistogram;

    /// Number of mid-handshake flush operations that were skipped due to future
    /// cancellation.
    pub fn skipped_mid_handshake_flush_count() -> Counter;

    /// Number of QUIC packets received where the CID could not be verified.
    // `reason` is a free-form string here; `DefaultMetrics` fills it with the
    // `to_string()` rendering of the error it is given.
    pub fn invalid_cid_packet_count(reason: String) -> Counter;

    /// Number of accepted QUIC Initial packets
    pub fn accepted_initial_packet_count() -> Counter;

    /// Number of accepted QUIC Initial packets using expensive label(s)
    #[optional]
    pub fn expensive_accepted_initial_packet_count(peer_ip: IpAddr) -> Counter;

    /// Number of QUIC packets received but not associated with an active
    /// connection
    pub fn rejected_initial_packet_count(
        reason: labels::QuicInvalidInitialPacketError,
    ) -> Counter;

    /// Number of QUIC packets received but not associated with an active
    /// connection using expensive label(s)
    #[optional]
    pub fn expensive_rejected_initial_packet_count(
        reason: labels::QuicInvalidInitialPacketError, peer_ip: IpAddr,
    ) -> Counter;

    /// Combined utilized bandwidth of all open connections (max over the past
    /// two minutes)
    pub fn utilized_bandwidth() -> Gauge;

    // NOTE(review): "bandwidh" below is a typo for "bandwidth". It is left
    // untouched because this text may be exported as the metric's help string.
    /// The highest utilized bandwidh reported during the lifetime of the
    /// connection
    // Buckets run from 0 to 10000 (presumably Mbps, per the metric name).
    #[ctor = HistogramBuilder { buckets: &[0., 1., 2., 5., 10., 20., 50., 100., 200., 300., 500., 750., 1000., 1500., 2000., 2500., 3000., 3500., 4000., 4500., 5000., 6000., 7000., 10000.], }]
    pub fn max_bandwidth_mbps() -> Histogram;

    /// The highest momentary loss reported during the lifetime of the
    /// connection
    // Buckets run from 0 to 100 (presumably percent, per the metric name).
    #[ctor = HistogramBuilder { buckets: &[0.0, 0.1, 0.2, 0.5, 1., 2., 3., 4., 5., 10., 15., 20., 25., 50., 100.], }]
    pub fn max_loss_pct() -> Histogram;

    /// Number of UDP packets dropped when receiving
    pub fn udp_drop_count() -> Counter;

    /// Number of failed quic handshakes
    pub fn failed_handshakes(reason: labels::HandshakeError) -> Counter;

    /// Number of HTTP/3 connection closures generated locally
    pub fn local_h3_conn_close_error_count(reason: labels::H3Error) -> Counter;

    /// Number of QUIC connection closures generated locally
    pub fn local_quic_conn_close_error_count(
        reason: labels::QuicError,
    ) -> Counter;

    /// Number of HTTP/3 connection closures generated by peer
    pub fn peer_h3_conn_close_error_count(reason: labels::H3Error) -> Counter;

    /// Number of QUIC connection closures generated by peer
    pub fn peer_quic_conn_close_error_count(reason: labels::QuicError)
        -> Counter;
}
350
// Per-task runtime metric declarations used by `DefaultMetrics`.
//
// This is a module local to this file that happens to be named `tokio`;
// `tokio::` paths written elsewhere in this file refer to it.
//
// NOTE(review): as with other `#[metrics]` modules, the `///` text is
// presumably exported as the metric's help string and `task` becomes a label
// — confirm before rewording the doc comments.
#[metrics]
mod tokio {
    /// Histogram of task schedule delays
    // Buckets: 0, then 1E-4 steps up to 1E-3, then coarser steps up to 1.0
    // (presumably seconds — confirm).
    #[ctor = HistogramBuilder { buckets: &[0.0, 1E-4, 2E-4, 3E-4, 4E-4, 5E-4, 6E-4, 7E-4, 8E-4, 9E-4, 1E-3, 1E-2, 2E-2, 4E-2, 8E-2, 1E-1, 1.0], }]
    pub fn runtime_task_schedule_delay_histogram(
        task: &Arc<str>,
    ) -> TimeHistogram;

    /// Histogram of task poll durations
    // Same bucket boundaries as the schedule-delay histogram above.
    #[ctor = HistogramBuilder { buckets: &[0.0, 1E-4, 2E-4, 3E-4, 4E-4, 5E-4, 6E-4, 7E-4, 8E-4, 9E-4, 1E-3, 1E-2, 2E-2, 4E-2, 8E-2, 1E-1, 1.0], }]
    pub fn runtime_task_poll_duration_histogram(task: &Arc<str>)
        -> TimeHistogram;

    /// Helps us get a rough idea of if our waker is causing issues.
    // A counter, not a histogram: per its name it accumulates total poll time
    // in microseconds for `task`.
    pub fn runtime_task_total_poll_time_micros(task: &Arc<str>) -> Counter;
}
367
368pub(crate) fn quic_expensive_metrics_ip_reduce(ip: IpAddr) -> Option<IpAddr> {
369    const QUIC_INITIAL_METRICS_V4_PREFIX: u8 = 20;
370    const QUIC_INITIAL_METRICS_V6_PREFIX: u8 = 32;
371
372    let prefix = if ip.is_ipv4() {
373        QUIC_INITIAL_METRICS_V4_PREFIX
374    } else {
375        QUIC_INITIAL_METRICS_V6_PREFIX
376    };
377
378    if let Ok(ip_net) = ipnetwork::IpNetwork::new(ip, prefix) {
379        Some(ip_net.network())
380    } else {
381        None
382    }
383}