tokio_quiche/socket/
capabilities.rs

1// Copyright (C) 2025, Cloudflare, Inc.
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are
6// met:
7//
8//     * Redistributions of source code must retain the above copyright notice,
9//       this list of conditions and the following disclaimer.
10//
11//     * Redistributions in binary form must reproduce the above copyright
12//       notice, this list of conditions and the following disclaimer in the
13//       documentation and/or other materials provided with the distribution.
14//
15// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
16// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
17// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
19// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27#[cfg(target_os = "linux")]
28mod linux_imports {
29    pub use libc::c_int;
30    pub use libc::c_void;
31    pub use libc::sock_txtime;
32    pub use libc::socklen_t;
33    pub use libc::IPPROTO_IP;
34    pub use libc::IPPROTO_IPV6;
35    pub use libc::IPV6_MTU_DISCOVER;
36    pub use libc::IPV6_PMTUDISC_PROBE;
37    pub use libc::IP_MTU_DISCOVER;
38    pub use libc::IP_PMTUDISC_PROBE;
39    pub use libc::SOL_SOCKET;
40    pub use libc::SO_RCVMARK;
41    pub use nix::errno::Errno;
42    pub use nix::sys::socket::getsockopt;
43    pub use nix::sys::socket::setsockopt;
44    pub use nix::sys::socket::sockopt::IpFreebind;
45    pub use nix::sys::socket::sockopt::IpTransparent;
46    pub use nix::sys::socket::sockopt::Ipv4OrigDstAddr;
47    pub use nix::sys::socket::sockopt::Ipv4PacketInfo;
48    pub use nix::sys::socket::sockopt::Ipv6OrigDstAddr;
49    pub use nix::sys::socket::sockopt::Ipv6RecvPacketInfo;
50    #[cfg(feature = "perf-quic-listener-metrics")]
51    pub use nix::sys::socket::sockopt::ReceiveTimestampns;
52    pub use nix::sys::socket::sockopt::RxqOvfl;
53    pub use nix::sys::socket::sockopt::TxTime;
54    pub use nix::sys::socket::sockopt::UdpGroSegment;
55    pub use nix::sys::socket::sockopt::UdpGsoSegment;
56    pub use nix::sys::socket::SetSockOpt;
57    pub use std::io;
58    pub use std::os::fd::AsFd;
59    pub use std::os::fd::AsRawFd;
60    pub use std::os::fd::BorrowedFd;
61}
62
63#[cfg(target_os = "linux")]
64use linux_imports::*;
65
66#[cfg(target_os = "linux")]
67#[derive(Clone)]
68struct IpMtuDiscoverProbe;
69
70#[cfg(target_os = "linux")]
71impl SetSockOpt for IpMtuDiscoverProbe {
72    type Val = ();
73
74    fn set<F: AsFd>(&self, fd: &F, _val: &Self::Val) -> nix::Result<()> {
75        let pmtud_mode: c_int = IP_PMTUDISC_PROBE;
76        let ret = unsafe {
77            libc::setsockopt(
78                fd.as_fd().as_raw_fd(),
79                IPPROTO_IP,
80                IP_MTU_DISCOVER,
81                &pmtud_mode as *const c_int as *const c_void,
82                std::mem::size_of::<c_int>() as socklen_t,
83            )
84        };
85
86        match ret {
87            0 => Ok(()),
88            _ => Err(Errno::last()),
89        }
90    }
91}
92
93#[cfg(target_os = "linux")]
94#[derive(Clone)]
95struct Ipv6MtuDiscoverProbe;
96
97#[cfg(target_os = "linux")]
98impl SetSockOpt for Ipv6MtuDiscoverProbe {
99    type Val = ();
100
101    fn set<F: AsFd>(&self, fd: &F, _val: &Self::Val) -> nix::Result<()> {
102        let pmtud_mode: c_int = IPV6_PMTUDISC_PROBE;
103        let ret = unsafe {
104            libc::setsockopt(
105                fd.as_fd().as_raw_fd(),
106                IPPROTO_IPV6,
107                IPV6_MTU_DISCOVER,
108                &pmtud_mode as *const c_int as *const c_void,
109                std::mem::size_of::<c_int>() as socklen_t,
110            )
111        };
112
113        match ret {
114            0 => Ok(()),
115            _ => Err(Errno::last()),
116        }
117    }
118}
119
120#[cfg(target_os = "linux")]
121#[derive(Clone)]
122struct RcvMark;
123
124#[cfg(target_os = "linux")]
125impl SetSockOpt for RcvMark {
126    type Val = ();
127
128    fn set<F: AsFd>(&self, fd: &F, _val: &Self::Val) -> nix::Result<()> {
129        // https://elixir.bootlin.com/linux/v6.17/source/net/core/sock.c#L1523
130        const ENABLE_SOCKOPT: i32 = 1;
131
132        let ret = unsafe {
133            libc::setsockopt(
134                fd.as_fd().as_raw_fd(),
135                SOL_SOCKET,
136                SO_RCVMARK,
137                &ENABLE_SOCKOPT as *const c_int as *const c_void,
138                std::mem::size_of::<c_int>() as socklen_t,
139            )
140        };
141
142        match ret {
143            0 => Ok(()),
144            _ => Err(Errno::last()),
145        }
146    }
147}
148
149/// Builder to enable Linux sockopts which improve QUIC performance.
150#[cfg(target_os = "linux")]
151pub struct SocketCapabilitiesBuilder<'s> {
152    socket: BorrowedFd<'s>,
153    cap: SocketCapabilities,
154}
155
156#[cfg(target_os = "linux")]
157impl<'s> SocketCapabilitiesBuilder<'s> {
158    /// Creates a new sockopt builder for `socket`.
159    pub fn new<S: AsFd>(socket: &'s S) -> Self {
160        Self {
161            socket: socket.as_fd(),
162            cap: Default::default(),
163        }
164    }
165
166    /// Enables [`UDP_SEGMENT`](https://man7.org/linux/man-pages/man7/udp.7.html),
167    /// a generic segmentation offload (GSO).
168    ///
169    /// GSO improves transmit performance by treating multiple sequential UDP
170    /// packets as a single entity in the kernel. Segmentation into
171    /// individual packets happens in the NIC, if it supports GSO. The
172    /// parameter specifies the packet size.
173    pub fn gso(&mut self) -> io::Result<()> {
174        // We initialize GSO on the socket with the maximum possible segment size
175        // to prevent accidentally setting it too small and running into
176        // issues when increasing max_send_udp_payload_size later on.
177        //
178        // https://elixir.bootlin.com/linux/v6.14.6/source/net/ipv4/udp.c#L2998
179        // https://elixir.bootlin.com/linux/v6.14.6/source/include/vdso/limits.h#L5
180        setsockopt(&self.socket.as_fd(), UdpGsoSegment, &(u16::MAX as i32))?;
181        self.cap.has_gso = true;
182        Ok(())
183    }
184
185    /// Enables [`SO_RXQ_OVFL`](https://man7.org/linux/man-pages/man7/socket.7.html),
186    /// which reports dropped packets due to insufficient buffer space.
187    pub fn check_udp_drop(&mut self) -> io::Result<()> {
188        setsockopt(&self.socket.as_fd(), RxqOvfl, &1)?;
189
190        self.cap.check_udp_drop = true;
191        Ok(())
192    }
193
194    /// Enables [`SO_TXTIME`](https://man7.org/linux/man-pages/man8/tc-etf.8.html)
195    /// to control packet transmit timestamps for QUIC pacing.
196    pub fn txtime(&mut self) -> io::Result<()> {
197        let cfg = sock_txtime {
198            clockid: libc::CLOCK_MONOTONIC,
199            flags: 0,
200        };
201        setsockopt(&self.socket.as_fd(), TxTime, &cfg)?;
202
203        self.cap.has_txtime = true;
204        Ok(())
205    }
206
207    /// Enables [`SO_TIMESTAMPNS`](https://man7.org/linux/man-pages/man7/socket.7.html),
208    /// which records a wall-clock timestamp for each received packet.
209    #[cfg(feature = "perf-quic-listener-metrics")]
210    pub fn rxtime(&mut self) -> io::Result<()> {
211        setsockopt(&self.socket.as_fd(), ReceiveTimestampns, &true)?;
212
213        self.cap.has_rxtime = true;
214        Ok(())
215    }
216
217    /// Enables [`UDP_GRO`](https://man7.org/linux/man-pages/man7/udp.7.html),
218    /// a generic receive offload (GRO).
219    ///
220    /// GRO improves receive performance by allowing the kernel to yield
221    /// multiple UDP packets in one [`recvmsg(2)`](https://man7.org/linux/man-pages/man2/recv.2.html)
222    /// call. It is the equivalent of GSO for the receive path.
223    pub fn gro(&mut self) -> io::Result<()> {
224        UdpGroSegment.set(&self.socket.as_fd(), &true)?;
225
226        self.cap.has_gro = true;
227        Ok(())
228    }
229
230    /// Enables [`IP_PKTINFO`](https://man7.org/linux/man-pages/man7/ip.7.html)
231    /// to control the source IP in outbound IPv4 packets.
232    pub fn ipv4_pktinfo(&mut self) -> io::Result<()> {
233        setsockopt(&self.socket.as_fd(), Ipv4PacketInfo, &true)?;
234
235        self.cap.has_ippktinfo = true;
236        Ok(())
237    }
238
239    /// Enables [`IP_RECVORIGDSTADDR`](https://man7.org/linux/man-pages/man7/ip.7.html),
240    /// which reports each packet's real IPv4 destination address.
241    ///
242    /// This can be different from the socket's local address due to netfilter
243    /// TPROXY rules or eBPF redirects.
244    pub fn ipv4_recvorigdstaddr(&mut self) -> io::Result<()> {
245        setsockopt(&self.socket.as_fd(), Ipv4OrigDstAddr, &true)?;
246
247        self.cap.has_iprecvorigdstaddr = true;
248        Ok(())
249    }
250
251    /// Enables [`IPV6_RECVPKTINFO`](https://man7.org/linux/man-pages/man7/ipv6.7.html)
252    /// to control the source IP in outbound IPv6 packets.
253    pub fn ipv6_pktinfo(&mut self) -> io::Result<()> {
254        setsockopt(&self.socket.as_fd(), Ipv6RecvPacketInfo, &true)?;
255
256        self.cap.has_ipv6pktinfo = true;
257        Ok(())
258    }
259
260    /// Enables [`IPV6_RECVORIGDSTADDR`](https://elixir.bootlin.com/linux/v6.12/source/net/ipv6/datagram.c#L722-L743),
261    /// which reports each packet's real IPv6 destination address.
262    ///
263    /// This can be different from the socket's local address due to netfilter
264    /// TPROXY rules or eBPF redirects.
265    pub fn ipv6_recvorigdstaddr(&mut self) -> io::Result<()> {
266        setsockopt(&self.socket.as_fd(), Ipv6OrigDstAddr, &true)?;
267
268        self.cap.has_ipv6recvorigdstaddr = true;
269        Ok(())
270    }
271
272    /// Sets [`IP_MTU_DISCOVER`](https://man7.org/linux/man-pages/man7/ip.7.html), to
273    /// `IP_PMTUDISC_PROBE`, which disables kernel PMTUD and sets the `DF`
274    /// (Don't Fragment) flag.
275    pub fn ip_mtu_discover_probe(&mut self) -> io::Result<()> {
276        setsockopt(&self.socket.as_fd(), IpMtuDiscoverProbe, &())?;
277
278        self.cap.has_ip_mtu_discover_probe = true;
279        Ok(())
280    }
281
282    /// Sets [`IPV6_MTU_DISCOVER`](https://man7.org/linux/man-pages/man7/ipv6.7.html), to
283    /// `IPV6_PMTUDISC_PROBE`, which disables kernel PMTUD and sets the `DF`
284    /// (Don't Fragment) flag.
285    pub fn ipv6_mtu_discover_probe(&mut self) -> io::Result<()> {
286        setsockopt(&self.socket.as_fd(), Ipv6MtuDiscoverProbe, &())?;
287
288        self.cap.has_ipv6_mtu_discover_probe = true;
289        Ok(())
290    }
291
292    /// Tests whether [`IP_FREEBIND`](https://man7.org/linux/man-pages/man7/ip.7.html)
293    /// or [`IP_TRANSPARENT`](https://man7.org/linux/man-pages/man7/ip.7.html) are
294    /// enabled for this socket.
295    ///
296    /// # Warning
297    /// These sockopts require elevated permissions to enable, so the builder
298    /// will only check their status. **If neither of them is enabled, the
299    /// `PKTINFO` sockopts will cause errors when sending packets.**
300    pub fn allows_nonlocal_source(&self) -> io::Result<bool> {
301        Ok(getsockopt(&self.socket.as_fd(), IpFreebind)? ||
302            getsockopt(&self.socket.as_fd(), IpTransparent)?)
303    }
304
305    pub fn rcvmark(&mut self) -> io::Result<()> {
306        setsockopt(&self.socket.as_fd(), RcvMark, &())?;
307
308        self.cap.has_mark = true;
309        Ok(())
310    }
311
312    /// Consumes the builder and returns the configured [`SocketCapabilities`].
313    pub fn finish(self) -> SocketCapabilities {
314        self.cap
315    }
316}
317
// TODO(erittenhouse): use `dgram`'s SocketCapabilities when we migrate over
#[cfg_attr(not(target_os = "linux"), expect(rustdoc::broken_intra_doc_links))]
/// Indicators of sockopts configured for a socket.
///
/// On Linux, a socket can be configured using a [`SocketCapabilitiesBuilder`],
/// which returns the sockopts that were applied successfully. By default, all
/// options are assumed to be disabled (including on OSes besides Linux).
///
/// As a shortcut, you may call `apply_all_and_get_compatibility` to apply the
/// maximum set of capabilities supported by this crate. The result will
/// indicate which options were actually enabled.
#[derive(Debug, Default)]
pub struct SocketCapabilities {
    /// Indicates if the socket has `UDP_SEGMENT` enabled.
    pub(crate) has_gso: bool,

    /// Indicates if the socket has `SO_RXQ_OVFL` set.
    // NOTE: RX-side sockopts are `expect(dead_code)` because we check for
    // received cmsgs directly
    #[cfg_attr(not(target_os = "linux"), expect(dead_code))]
    pub(crate) check_udp_drop: bool,

    /// Indicates if the socket was configured with `SO_TXTIME`.
    pub(crate) has_txtime: bool,

    /// Indicates if the socket has `SO_TIMESTAMPNS` enabled.
    #[cfg_attr(
        not(all(target_os = "linux", feature = "perf-quic-listener-metrics")),
        expect(dead_code)
    )]
    pub(crate) has_rxtime: bool,

    /// Indicates if the socket has `UDP_GRO` enabled.
    #[cfg_attr(not(target_os = "linux"), expect(dead_code))]
    pub(crate) has_gro: bool,

    /// Indicates if the socket has `IP_PKTINFO` set.
    pub(crate) has_ippktinfo: bool,

    /// Indicates if the socket has `IP_RECVORIGDSTADDR` set.
    #[cfg_attr(not(target_os = "linux"), expect(dead_code))]
    pub(crate) has_iprecvorigdstaddr: bool,

    /// Indicates if the socket has `IPV6_RECVPKTINFO` set.
    pub(crate) has_ipv6pktinfo: bool,

    /// Indicates if the socket has `IPV6_RECVORIGDSTADDR` set.
    #[cfg_attr(not(target_os = "linux"), expect(dead_code))]
    pub(crate) has_ipv6recvorigdstaddr: bool,

    /// Indicates if the socket has `IP_MTU_DISCOVER` set to
    /// `IP_PMTUDISC_PROBE`.
    #[cfg_attr(not(target_os = "linux"), expect(dead_code))]
    pub(crate) has_ip_mtu_discover_probe: bool,

    /// Indicates if the socket has `IPV6_MTU_DISCOVER` set to
    /// `IPV6_PMTUDISC_PROBE`.
    #[cfg_attr(not(target_os = "linux"), expect(dead_code))]
    pub(crate) has_ipv6_mtu_discover_probe: bool,

    /// Indicates if the socket is set to receive `SO_MARK` messages via
    /// `SO_RCVMARK`.
    #[cfg_attr(not(target_os = "linux"), expect(dead_code))]
    pub(crate) has_mark: bool,
}
382
383impl SocketCapabilities {
384    /// Tries to enable all supported sockopts and returns indicators
385    /// of which settings were successfully applied.
386    #[cfg(target_os = "linux")]
387    pub fn apply_all_and_get_compatibility<S>(socket: &S) -> Self
388    where
389        S: AsFd,
390    {
391        let mut b = SocketCapabilitiesBuilder::new(socket);
392        let _ = b.gso();
393        let _ = b.check_udp_drop();
394        let _ = b.txtime();
395        #[cfg(feature = "perf-quic-listener-metrics")]
396        let _ = b.rxtime();
397        let _ = b.gro();
398        let _ = b.rcvmark();
399
400        // We can't determine if this is an IPv4 or IPv6 socket, so try setting
401        // the relevant options for both
402        let _ = b.ip_mtu_discover_probe();
403        let _ = b.ipv6_mtu_discover_probe();
404        if let Ok(true) = b.allows_nonlocal_source() {
405            let _ = b.ipv4_pktinfo();
406            let _ = b.ipv4_recvorigdstaddr();
407            let _ = b.ipv6_pktinfo();
408            let _ = b.ipv6_recvorigdstaddr();
409        }
410        b.finish()
411    }
412}