tokio_quiche/socket/
capabilities.rs

1// Copyright (C) 2025, Cloudflare, Inc.
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are
6// met:
7//
8//     * Redistributions of source code must retain the above copyright notice,
9//       this list of conditions and the following disclaimer.
10//
11//     * Redistributions in binary form must reproduce the above copyright
12//       notice, this list of conditions and the following disclaimer in the
13//       documentation and/or other materials provided with the distribution.
14//
15// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
16// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
17// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
19// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27#[cfg(target_os = "linux")]
28mod linux_imports {
29    pub use libc::c_int;
30    pub use libc::c_void;
31    pub use libc::sock_txtime;
32    pub use libc::socklen_t;
33    pub use libc::IPPROTO_IP;
34    pub use libc::IPPROTO_IPV6;
35    pub use libc::IPV6_MTU_DISCOVER;
36    pub use libc::IPV6_PMTUDISC_PROBE;
37    pub use libc::IP_MTU_DISCOVER;
38    pub use libc::IP_PMTUDISC_PROBE;
39    pub use nix::errno::Errno;
40    pub use nix::sys::socket::getsockopt;
41    pub use nix::sys::socket::setsockopt;
42    pub use nix::sys::socket::sockopt::IpFreebind;
43    pub use nix::sys::socket::sockopt::IpTransparent;
44    pub use nix::sys::socket::sockopt::Ipv4OrigDstAddr;
45    pub use nix::sys::socket::sockopt::Ipv4PacketInfo;
46    pub use nix::sys::socket::sockopt::Ipv6OrigDstAddr;
47    pub use nix::sys::socket::sockopt::Ipv6RecvPacketInfo;
48    #[cfg(feature = "perf-quic-listener-metrics")]
49    pub use nix::sys::socket::sockopt::ReceiveTimestampns;
50    pub use nix::sys::socket::sockopt::RxqOvfl;
51    pub use nix::sys::socket::sockopt::TxTime;
52    pub use nix::sys::socket::sockopt::UdpGroSegment;
53    pub use nix::sys::socket::sockopt::UdpGsoSegment;
54    pub use nix::sys::socket::SetSockOpt;
55    pub use std::io;
56    pub use std::os::fd::AsFd;
57    pub use std::os::fd::AsRawFd as _;
58    pub use std::os::fd::BorrowedFd;
59    pub use std::os::fd::RawFd;
60}
61
62#[cfg(target_os = "linux")]
63use linux_imports::*;
64
65#[cfg(target_os = "linux")]
66#[derive(Clone)]
67struct IpMtuDiscoverProbe;
68
69#[cfg(target_os = "linux")]
70impl SetSockOpt for IpMtuDiscoverProbe {
71    type Val = ();
72
73    fn set(&self, fd: RawFd, _val: &Self::Val) -> nix::Result<()> {
74        let pmtud_mode: c_int = IP_PMTUDISC_PROBE;
75        let ret = unsafe {
76            libc::setsockopt(
77                fd,
78                IPPROTO_IP,
79                IP_MTU_DISCOVER,
80                &pmtud_mode as *const c_int as *const c_void,
81                std::mem::size_of::<c_int>() as socklen_t,
82            )
83        };
84
85        match ret {
86            0 => Ok(()),
87            _ => Err(Errno::last()),
88        }
89    }
90}
91
92#[cfg(target_os = "linux")]
93#[derive(Clone)]
94struct Ipv6MtuDiscoverProbe;
95
96#[cfg(target_os = "linux")]
97impl SetSockOpt for Ipv6MtuDiscoverProbe {
98    type Val = ();
99
100    fn set(&self, fd: RawFd, _val: &Self::Val) -> nix::Result<()> {
101        let pmtud_mode: c_int = IPV6_PMTUDISC_PROBE;
102        let ret = unsafe {
103            libc::setsockopt(
104                fd,
105                IPPROTO_IPV6,
106                IPV6_MTU_DISCOVER,
107                &pmtud_mode as *const c_int as *const c_void,
108                std::mem::size_of::<c_int>() as socklen_t,
109            )
110        };
111
112        match ret {
113            0 => Ok(()),
114            _ => Err(Errno::last()),
115        }
116    }
117}
118
119/// Builder to enable Linux sockopts which improve QUIC performance.
120#[cfg(target_os = "linux")]
121pub struct SocketCapabilitiesBuilder<'s> {
122    socket: BorrowedFd<'s>,
123    cap: SocketCapabilities,
124}
125
126#[cfg(target_os = "linux")]
127impl<'s> SocketCapabilitiesBuilder<'s> {
128    /// Creates a new sockopt builder for `socket`.
129    pub fn new<S: AsFd>(socket: &'s S) -> Self {
130        Self {
131            socket: socket.as_fd(),
132            cap: Default::default(),
133        }
134    }
135
136    /// Enables [`UDP_SEGMENT`](https://man7.org/linux/man-pages/man7/udp.7.html),
137    /// a generic segmentation offload (GSO).
138    ///
139    /// GSO improves transmit performance by treating multiple sequential UDP
140    /// packets as a single entity in the kernel. Segmentation into
141    /// individual packets happens in the NIC, if it supports GSO. The
142    /// parameter specifies the packet size.
143    pub fn gso(&mut self) -> io::Result<()> {
144        // We initialize GSO on the socket with the maximum possible segment size
145        // to prevent accidentally setting it too small and running into
146        // issues when increasing max_send_udp_payload_size later on.
147        //
148        // https://elixir.bootlin.com/linux/v6.14.6/source/net/ipv4/udp.c#L2998
149        // https://elixir.bootlin.com/linux/v6.14.6/source/include/vdso/limits.h#L5
150        setsockopt(self.socket.as_raw_fd(), UdpGsoSegment, &(u16::MAX as i32))?;
151        self.cap.has_gso = true;
152        Ok(())
153    }
154
155    /// Enables [`SO_RXQ_OVFL`](https://man7.org/linux/man-pages/man7/socket.7.html),
156    /// which reports dropped packets due to insufficient buffer space.
157    pub fn check_udp_drop(&mut self) -> io::Result<()> {
158        setsockopt(self.socket.as_raw_fd(), RxqOvfl, &1)?;
159
160        self.cap.check_udp_drop = true;
161        Ok(())
162    }
163
164    /// Enables [`SO_TXTIME`](https://man7.org/linux/man-pages/man8/tc-etf.8.html)
165    /// to control packet transmit timestamps for QUIC pacing.
166    pub fn txtime(&mut self) -> io::Result<()> {
167        let cfg = sock_txtime {
168            clockid: libc::CLOCK_MONOTONIC,
169            flags: 0,
170        };
171        setsockopt(self.socket.as_raw_fd(), TxTime, &cfg)?;
172
173        self.cap.has_txtime = true;
174        Ok(())
175    }
176
177    /// Enables [`SO_TIMESTAMPNS`](https://man7.org/linux/man-pages/man7/socket.7.html),
178    /// which records a wall-clock timestamp for each received packet.
179    #[cfg(feature = "perf-quic-listener-metrics")]
180    pub fn rxtime(&mut self) -> io::Result<()> {
181        setsockopt(self.socket.as_raw_fd(), ReceiveTimestampns, &true)?;
182
183        self.cap.has_rxtime = true;
184        Ok(())
185    }
186
187    /// Enables [`UDP_GRO`](https://man7.org/linux/man-pages/man7/udp.7.html),
188    /// a generic receive offload (GRO).
189    ///
190    /// GRO improves receive performance by allowing the kernel to yield
191    /// multiple UDP packets in one [`recvmsg(2)`](https://man7.org/linux/man-pages/man2/recv.2.html)
192    /// call. It is the equivalent of GSO for the receive path.
193    pub fn gro(&mut self) -> io::Result<()> {
194        UdpGroSegment.set(self.socket.as_raw_fd(), &true)?;
195
196        self.cap.has_gro = true;
197        Ok(())
198    }
199
200    /// Enables [`IP_PKTINFO`](https://man7.org/linux/man-pages/man7/ip.7.html)
201    /// to control the source IP in outbound IPv4 packets.
202    pub fn ipv4_pktinfo(&mut self) -> io::Result<()> {
203        setsockopt(self.socket.as_raw_fd(), Ipv4PacketInfo, &true)?;
204
205        self.cap.has_ippktinfo = true;
206        Ok(())
207    }
208
209    /// Enables [`IP_RECVORIGDSTADDR`](https://man7.org/linux/man-pages/man7/ip.7.html),
210    /// which reports each packet's real IPv4 destination address.
211    ///
212    /// This can be different from the socket's local address due to netfilter
213    /// TPROXY rules or eBPF redirects.
214    pub fn ipv4_recvorigdstaddr(&mut self) -> io::Result<()> {
215        setsockopt(self.socket.as_raw_fd(), Ipv4OrigDstAddr, &true)?;
216
217        self.cap.has_iprecvorigdstaddr = true;
218        Ok(())
219    }
220
221    /// Enables [`IPV6_RECVPKTINFO`](https://man7.org/linux/man-pages/man7/ipv6.7.html)
222    /// to control the source IP in outbound IPv6 packets.
223    pub fn ipv6_pktinfo(&mut self) -> io::Result<()> {
224        setsockopt(self.socket.as_raw_fd(), Ipv6RecvPacketInfo, &true)?;
225
226        self.cap.has_ipv6pktinfo = true;
227        Ok(())
228    }
229
230    /// Enables [`IPV6_RECVORIGDSTADDR`](https://elixir.bootlin.com/linux/v6.12/source/net/ipv6/datagram.c#L722-L743),
231    /// which reports each packet's real IPv6 destination address.
232    ///
233    /// This can be different from the socket's local address due to netfilter
234    /// TPROXY rules or eBPF redirects.
235    pub fn ipv6_recvorigdstaddr(&mut self) -> io::Result<()> {
236        setsockopt(self.socket.as_raw_fd(), Ipv6OrigDstAddr, &true)?;
237
238        self.cap.has_ipv6recvorigdstaddr = true;
239        Ok(())
240    }
241
242    /// Sets [`IP_MTU_DISCOVER`](https://man7.org/linux/man-pages/man7/ip.7.html), to
243    /// `IP_PMTUDISC_PROBE`, which disables kernel PMTUD and sets the `DF`
244    /// (Don't Fragment) flag.
245    pub fn ip_mtu_discover_probe(&mut self) -> io::Result<()> {
246        setsockopt(self.socket.as_raw_fd(), IpMtuDiscoverProbe, &())?;
247
248        self.cap.has_ip_mtu_discover_probe = true;
249        Ok(())
250    }
251
252    /// Sets [`IPV6_MTU_DISCOVER`](https://man7.org/linux/man-pages/man7/ipv6.7.html), to
253    /// `IPV6_PMTUDISC_PROBE`, which disables kernel PMTUD and sets the `DF`
254    /// (Don't Fragment) flag.
255    pub fn ipv6_mtu_discover_probe(&mut self) -> io::Result<()> {
256        setsockopt(self.socket.as_raw_fd(), Ipv6MtuDiscoverProbe, &())?;
257
258        self.cap.has_ipv6_mtu_discover_probe = true;
259        Ok(())
260    }
261
262    /// Tests whether [`IP_FREEBIND`](https://man7.org/linux/man-pages/man7/ip.7.html)
263    /// or [`IP_TRANSPARENT`](https://man7.org/linux/man-pages/man7/ip.7.html) are
264    /// enabled for this socket.
265    ///
266    /// # Warning
267    /// These sockopts require elevated permissions to enable, so the builder
268    /// will only check their status. **If neither of them is enabled, the
269    /// `PKTINFO` sockopts will cause errors when sending packets.**
270    pub fn allows_nonlocal_source(&self) -> io::Result<bool> {
271        Ok(getsockopt(self.socket.as_raw_fd(), IpFreebind)? ||
272            getsockopt(self.socket.as_raw_fd(), IpTransparent)?)
273    }
274
275    /// Consumes the builder and returns the configured [`SocketCapabilities`].
276    pub fn finish(self) -> SocketCapabilities {
277        self.cap
278    }
279}
280
// TODO(erittenhouse): use `dgram`'s SocketCapabilities when we migrate over
#[cfg_attr(not(target_os = "linux"), expect(rustdoc::broken_intra_doc_links))]
/// Indicators of sockopts configured for a socket.
///
/// On Linux, a socket can be configured using a [`SocketCapabilitiesBuilder`],
/// which returns the sockopts that were applied successfully. By default, all
/// options are assumed to be disabled (including on OSes besides Linux).
///
/// As a shortcut, you may call `apply_all_and_get_compatibility` to apply the
/// maximum set of capabilities supported by this crate. The result will
/// indicate which options were actually enabled.
// NOTE: `PartialEq`/`Eq` are deliberately not derived: the derived impl reads
// every field, which would leave the `expect(dead_code)` attributes below
// unfulfilled on non-Linux targets.
#[derive(Clone, Debug, Default)]
pub struct SocketCapabilities {
    /// Indicates if the socket has `UDP_SEGMENT` enabled.
    pub(crate) has_gso: bool,

    /// Indicates if the socket has `SO_RXQ_OVFL` set.
    // NOTE: RX-side sockopts are `expect(dead_code)` because we check for
    // received cmsgs directly
    #[cfg_attr(not(target_os = "linux"), expect(dead_code))]
    pub(crate) check_udp_drop: bool,

    /// Indicates if the socket was configured with `SO_TXTIME`.
    pub(crate) has_txtime: bool,

    /// Indicates if the socket has `SO_TIMESTAMPNS` enabled.
    #[cfg_attr(
        not(all(target_os = "linux", feature = "perf-quic-listener-metrics")),
        expect(dead_code)
    )]
    pub(crate) has_rxtime: bool,

    /// Indicates if the socket has `UDP_GRO` enabled.
    #[cfg_attr(not(target_os = "linux"), expect(dead_code))]
    pub(crate) has_gro: bool,

    /// Indicates if the socket has `IP_PKTINFO` set.
    pub(crate) has_ippktinfo: bool,

    /// Indicates if the socket has `IP_RECVORIGDSTADDR` set.
    #[cfg_attr(not(target_os = "linux"), expect(dead_code))]
    pub(crate) has_iprecvorigdstaddr: bool,

    /// Indicates if the socket has `IPV6_RECVPKTINFO` set.
    pub(crate) has_ipv6pktinfo: bool,

    /// Indicates if the socket has `IPV6_RECVORIGDSTADDR` set.
    #[cfg_attr(not(target_os = "linux"), expect(dead_code))]
    pub(crate) has_ipv6recvorigdstaddr: bool,

    /// Indicates if the socket has `IP_MTU_DISCOVER` set to
    /// `IP_PMTUDISC_PROBE`.
    #[cfg_attr(not(target_os = "linux"), expect(dead_code))]
    pub(crate) has_ip_mtu_discover_probe: bool,

    /// Indicates if the socket has `IPV6_MTU_DISCOVER` set to
    /// `IPV6_PMTUDISC_PROBE`.
    #[cfg_attr(not(target_os = "linux"), expect(dead_code))]
    pub(crate) has_ipv6_mtu_discover_probe: bool,
}
340
341impl SocketCapabilities {
342    /// Tries to enable all supported sockopts and returns indicators
343    /// of which settings were successfully applied.
344    #[cfg(target_os = "linux")]
345    pub fn apply_all_and_get_compatibility<S>(socket: &S) -> Self
346    where
347        S: AsFd,
348    {
349        let mut b = SocketCapabilitiesBuilder::new(socket);
350        let _ = b.gso();
351        let _ = b.check_udp_drop();
352        let _ = b.txtime();
353        #[cfg(feature = "perf-quic-listener-metrics")]
354        let _ = b.rxtime();
355        let _ = b.gro();
356
357        // We can't determine if this is an IPv4 or IPv6 socket, so try setting
358        // the relevant options for both
359        let _ = b.ip_mtu_discover_probe();
360        let _ = b.ipv6_mtu_discover_probe();
361        if let Ok(true) = b.allows_nonlocal_source() {
362            let _ = b.ipv4_pktinfo();
363            let _ = b.ipv4_recvorigdstaddr();
364            let _ = b.ipv6_pktinfo();
365            let _ = b.ipv6_recvorigdstaddr();
366        }
367        b.finish()
368    }
369}