tokio_quiche/socket/capabilities.rs
1// Copyright (C) 2025, Cloudflare, Inc.
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are
6// met:
7//
8// * Redistributions of source code must retain the above copyright notice,
9// this list of conditions and the following disclaimer.
10//
11// * Redistributions in binary form must reproduce the above copyright
12// notice, this list of conditions and the following disclaimer in the
13// documentation and/or other materials provided with the distribution.
14//
15// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
16// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
17// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
19// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27#[cfg(target_os = "linux")]
28mod linux_imports {
29 pub use libc::c_int;
30 pub use libc::c_void;
31 pub use libc::sock_txtime;
32 pub use libc::socklen_t;
33 pub use libc::IPPROTO_IP;
34 pub use libc::IPPROTO_IPV6;
35 pub use libc::IPV6_MTU_DISCOVER;
36 pub use libc::IPV6_PMTUDISC_PROBE;
37 pub use libc::IP_MTU_DISCOVER;
38 pub use libc::IP_PMTUDISC_PROBE;
39 pub use nix::errno::Errno;
40 pub use nix::sys::socket::getsockopt;
41 pub use nix::sys::socket::setsockopt;
42 pub use nix::sys::socket::sockopt::IpFreebind;
43 pub use nix::sys::socket::sockopt::IpTransparent;
44 pub use nix::sys::socket::sockopt::Ipv4OrigDstAddr;
45 pub use nix::sys::socket::sockopt::Ipv4PacketInfo;
46 pub use nix::sys::socket::sockopt::Ipv6OrigDstAddr;
47 pub use nix::sys::socket::sockopt::Ipv6RecvPacketInfo;
48 #[cfg(feature = "perf-quic-listener-metrics")]
49 pub use nix::sys::socket::sockopt::ReceiveTimestampns;
50 pub use nix::sys::socket::sockopt::RxqOvfl;
51 pub use nix::sys::socket::sockopt::TxTime;
52 pub use nix::sys::socket::sockopt::UdpGroSegment;
53 pub use nix::sys::socket::sockopt::UdpGsoSegment;
54 pub use nix::sys::socket::SetSockOpt;
55 pub use std::io;
56 pub use std::os::fd::AsFd;
57 pub use std::os::fd::AsRawFd as _;
58 pub use std::os::fd::BorrowedFd;
59 pub use std::os::fd::RawFd;
60}
61
62#[cfg(target_os = "linux")]
63use linux_imports::*;
64
/// Marker sockopt that switches a socket's `IP_MTU_DISCOVER` mode to
/// `IP_PMTUDISC_PROBE`.
///
/// It carries no data: the value written is fixed by its `SetSockOpt`
/// implementation.
#[cfg(target_os = "linux")]
#[derive(Clone)]
struct IpMtuDiscoverProbe;
68
69#[cfg(target_os = "linux")]
70impl SetSockOpt for IpMtuDiscoverProbe {
71 type Val = ();
72
73 fn set(&self, fd: RawFd, _val: &Self::Val) -> nix::Result<()> {
74 let pmtud_mode: c_int = IP_PMTUDISC_PROBE;
75 let ret = unsafe {
76 libc::setsockopt(
77 fd,
78 IPPROTO_IP,
79 IP_MTU_DISCOVER,
80 &pmtud_mode as *const c_int as *const c_void,
81 std::mem::size_of::<c_int>() as socklen_t,
82 )
83 };
84
85 match ret {
86 0 => Ok(()),
87 _ => Err(Errno::last()),
88 }
89 }
90}
91
/// Marker sockopt that switches a socket's `IPV6_MTU_DISCOVER` mode to
/// `IPV6_PMTUDISC_PROBE`.
///
/// It carries no data: the value written is fixed by its `SetSockOpt`
/// implementation.
#[cfg(target_os = "linux")]
#[derive(Clone)]
struct Ipv6MtuDiscoverProbe;
95
96#[cfg(target_os = "linux")]
97impl SetSockOpt for Ipv6MtuDiscoverProbe {
98 type Val = ();
99
100 fn set(&self, fd: RawFd, _val: &Self::Val) -> nix::Result<()> {
101 let pmtud_mode: c_int = IPV6_PMTUDISC_PROBE;
102 let ret = unsafe {
103 libc::setsockopt(
104 fd,
105 IPPROTO_IPV6,
106 IPV6_MTU_DISCOVER,
107 &pmtud_mode as *const c_int as *const c_void,
108 std::mem::size_of::<c_int>() as socklen_t,
109 )
110 };
111
112 match ret {
113 0 => Ok(()),
114 _ => Err(Errno::last()),
115 }
116 }
117}
118
/// Builder to enable Linux sockopts which improve QUIC performance.
///
/// Each setter applies one sockopt to the borrowed socket and, only if that
/// call succeeds, records it in the [`SocketCapabilities`] returned by
/// `finish()`.
#[cfg(target_os = "linux")]
pub struct SocketCapabilitiesBuilder<'s> {
    /// Borrowed descriptor of the socket the sockopts are applied to.
    socket: BorrowedFd<'s>,
    /// Flags for the sockopts enabled so far; handed out by `finish()`.
    cap: SocketCapabilities,
}
125
126#[cfg(target_os = "linux")]
127impl<'s> SocketCapabilitiesBuilder<'s> {
128 /// Creates a new sockopt builder for `socket`.
129 pub fn new<S: AsFd>(socket: &'s S) -> Self {
130 Self {
131 socket: socket.as_fd(),
132 cap: Default::default(),
133 }
134 }
135
136 /// Enables [`UDP_SEGMENT`](https://man7.org/linux/man-pages/man7/udp.7.html),
137 /// a generic segmentation offload (GSO).
138 ///
139 /// GSO improves transmit performance by treating multiple sequential UDP
140 /// packets as a single entity in the kernel. Segmentation into
141 /// individual packets happens in the NIC, if it supports GSO. The
142 /// parameter specifies the packet size.
143 pub fn gso(&mut self, max_send_udp_payload_size: usize) -> io::Result<()> {
144 setsockopt(
145 self.socket.as_raw_fd(),
146 UdpGsoSegment,
147 &(max_send_udp_payload_size as i32),
148 )?;
149 self.cap.has_gso = true;
150 Ok(())
151 }
152
153 /// Enables [`SO_RXQ_OVFL`](https://man7.org/linux/man-pages/man7/socket.7.html),
154 /// which reports dropped packets due to insufficient buffer space.
155 pub fn check_udp_drop(&mut self) -> io::Result<()> {
156 setsockopt(self.socket.as_raw_fd(), RxqOvfl, &1)?;
157
158 self.cap.check_udp_drop = true;
159 Ok(())
160 }
161
162 /// Enables [`SO_TXTIME`](https://man7.org/linux/man-pages/man8/tc-etf.8.html)
163 /// to control packet transmit timestamps for QUIC pacing.
164 pub fn txtime(&mut self) -> io::Result<()> {
165 let cfg = sock_txtime {
166 clockid: libc::CLOCK_MONOTONIC,
167 flags: 0,
168 };
169 setsockopt(self.socket.as_raw_fd(), TxTime, &cfg)?;
170
171 self.cap.has_txtime = true;
172 Ok(())
173 }
174
175 /// Enables [`SO_TIMESTAMPNS`](https://man7.org/linux/man-pages/man7/socket.7.html),
176 /// which records a wall-clock timestamp for each received packet.
177 #[cfg(feature = "perf-quic-listener-metrics")]
178 pub fn rxtime(&mut self) -> io::Result<()> {
179 setsockopt(self.socket.as_raw_fd(), ReceiveTimestampns, &true)?;
180
181 self.cap.has_rxtime = true;
182 Ok(())
183 }
184
185 /// Enables [`UDP_GRO`](https://man7.org/linux/man-pages/man7/udp.7.html),
186 /// a generic receive offload (GRO).
187 ///
188 /// GRO improves receive performance by allowing the kernel to yield
189 /// multiple UDP packets in one [`recvmsg(2)`](https://man7.org/linux/man-pages/man2/recv.2.html)
190 /// call. It is the equivalent of GSO for the receive path.
191 pub fn gro(&mut self) -> io::Result<()> {
192 UdpGroSegment.set(self.socket.as_raw_fd(), &true)?;
193
194 self.cap.has_gro = true;
195 Ok(())
196 }
197
198 /// Enables [`IP_PKTINFO`](https://man7.org/linux/man-pages/man7/ip.7.html)
199 /// to control the source IP in outbound IPv4 packets.
200 pub fn ipv4_pktinfo(&mut self) -> io::Result<()> {
201 setsockopt(self.socket.as_raw_fd(), Ipv4PacketInfo, &true)?;
202
203 self.cap.has_ippktinfo = true;
204 Ok(())
205 }
206
207 /// Enables [`IP_RECVORIGDSTADDR`](https://man7.org/linux/man-pages/man7/ip.7.html),
208 /// which reports each packet's real IPv4 destination address.
209 ///
210 /// This can be different from the socket's local address due to netfilter
211 /// TPROXY rules or eBPF redirects.
212 pub fn ipv4_recvorigdstaddr(&mut self) -> io::Result<()> {
213 setsockopt(self.socket.as_raw_fd(), Ipv4OrigDstAddr, &true)?;
214
215 self.cap.has_iprecvorigdstaddr = true;
216 Ok(())
217 }
218
219 /// Enables [`IPV6_RECVPKTINFO`](https://man7.org/linux/man-pages/man7/ipv6.7.html)
220 /// to control the source IP in outbound IPv6 packets.
221 pub fn ipv6_pktinfo(&mut self) -> io::Result<()> {
222 setsockopt(self.socket.as_raw_fd(), Ipv6RecvPacketInfo, &true)?;
223
224 self.cap.has_ipv6pktinfo = true;
225 Ok(())
226 }
227
228 /// Enables [`IPV6_RECVORIGDSTADDR`](https://elixir.bootlin.com/linux/v6.12/source/net/ipv6/datagram.c#L722-L743),
229 /// which reports each packet's real IPv6 destination address.
230 ///
231 /// This can be different from the socket's local address due to netfilter
232 /// TPROXY rules or eBPF redirects.
233 pub fn ipv6_recvorigdstaddr(&mut self) -> io::Result<()> {
234 setsockopt(self.socket.as_raw_fd(), Ipv6OrigDstAddr, &true)?;
235
236 self.cap.has_ipv6recvorigdstaddr = true;
237 Ok(())
238 }
239
240 /// Sets [`IP_MTU_DISCOVER`](https://man7.org/linux/man-pages/man7/ip.7.html), to
241 /// `IP_PMTUDISC_PROBE`, which disables kernel PMTUD and sets the `DF`
242 /// (Don't Fragment) flag.
243 pub fn ip_mtu_discover_probe(&mut self) -> io::Result<()> {
244 setsockopt(self.socket.as_raw_fd(), IpMtuDiscoverProbe, &())?;
245
246 self.cap.has_ip_mtu_discover_probe = true;
247 Ok(())
248 }
249
250 /// Sets [`IPV6_MTU_DISCOVER`](https://man7.org/linux/man-pages/man7/ipv6.7.html), to
251 /// `IPV6_PMTUDISC_PROBE`, which disables kernel PMTUD and sets the `DF`
252 /// (Don't Fragment) flag.
253 pub fn ipv6_mtu_discover_probe(&mut self) -> io::Result<()> {
254 setsockopt(self.socket.as_raw_fd(), Ipv6MtuDiscoverProbe, &())?;
255
256 self.cap.has_ipv6_mtu_discover_probe = true;
257 Ok(())
258 }
259
260 /// Tests whether [`IP_FREEBIND`](https://man7.org/linux/man-pages/man7/ip.7.html)
261 /// or [`IP_TRANSPARENT`](https://man7.org/linux/man-pages/man7/ip.7.html) are
262 /// enabled for this socket.
263 ///
264 /// # Warning
265 /// These sockopts require elevated permissions to enable, so the builder
266 /// will only check their status. **If neither of them is enabled, the
267 /// `PKTINFO` sockopts will cause errors when sending packets.**
268 pub fn allows_nonlocal_source(&self) -> io::Result<bool> {
269 Ok(getsockopt(self.socket.as_raw_fd(), IpFreebind)? ||
270 getsockopt(self.socket.as_raw_fd(), IpTransparent)?)
271 }
272
273 /// Consumes the builder and returns the configured [`SocketCapabilities`].
274 pub fn finish(self) -> SocketCapabilities {
275 self.cap
276 }
277}
278
// TODO(erittenhouse): use `dgram`'s SocketCapabilities when we migrate over
#[cfg_attr(not(target_os = "linux"), expect(rustdoc::broken_intra_doc_links))]
/// Indicators of sockopts configured for a socket.
///
/// On Linux, a socket can be configured using a [`SocketCapabilitiesBuilder`],
/// which returns the sockopts that were applied successfully. By default, all
/// options are assumed to be disabled (including on OSes besides Linux).
///
/// As a shortcut, you may call `apply_all_and_get_compatibility` to apply the
/// maximum set of capabilities supported by this crate. The result will
/// indicate which options were actually enabled.
#[derive(Debug, Default)]
pub struct SocketCapabilities {
    /// Indicates if the socket has `UDP_SEGMENT` enabled.
    pub(crate) has_gso: bool,

    /// Indicates if the socket has `SO_RXQ_OVFL` set.
    // NOTE: RX-side sockopts are `expect(dead_code)` because we check for
    // received cmsgs directly
    #[cfg_attr(not(target_os = "linux"), expect(dead_code))]
    pub(crate) check_udp_drop: bool,

    /// Indicates if the socket was configured with `SO_TXTIME`.
    pub(crate) has_txtime: bool,

    /// Indicates if the socket has `SO_TIMESTAMPNS` enabled.
    #[expect(dead_code)]
    pub(crate) has_rxtime: bool,

    /// Indicates if the socket has `UDP_GRO` enabled.
    #[cfg_attr(not(target_os = "linux"), expect(dead_code))]
    pub(crate) has_gro: bool,

    /// Indicates if the socket has `IP_PKTINFO` set.
    pub(crate) has_ippktinfo: bool,

    /// Indicates if the socket has `IP_RECVORIGDSTADDR` set.
    #[cfg_attr(not(target_os = "linux"), expect(dead_code))]
    pub(crate) has_iprecvorigdstaddr: bool,

    /// Indicates if the socket has `IPV6_RECVPKTINFO` set.
    pub(crate) has_ipv6pktinfo: bool,

    /// Indicates if the socket has `IPV6_RECVORIGDSTADDR` set.
    #[cfg_attr(not(target_os = "linux"), expect(dead_code))]
    pub(crate) has_ipv6recvorigdstaddr: bool,

    /// Indicates if the socket has `IP_MTU_DISCOVER` set to
    /// `IP_PMTUDISC_PROBE`.
    #[cfg_attr(not(target_os = "linux"), expect(dead_code))]
    pub(crate) has_ip_mtu_discover_probe: bool,

    /// Indicates if the socket has `IPV6_MTU_DISCOVER` set to
    /// `IPV6_PMTUDISC_PROBE`.
    #[cfg_attr(not(target_os = "linux"), expect(dead_code))]
    pub(crate) has_ipv6_mtu_discover_probe: bool,
}
335
336impl SocketCapabilities {
337 /// Tries to enable all supported sockopts and returns indicators
338 /// of which settings were successfully applied.
339 #[cfg(target_os = "linux")]
340 pub fn apply_all_and_get_compatibility<S>(
341 socket: &S, max_send_udp_payload_size: usize,
342 ) -> Self
343 where
344 S: AsFd,
345 {
346 let mut b = SocketCapabilitiesBuilder::new(socket);
347 let _ = b.gso(max_send_udp_payload_size);
348 let _ = b.check_udp_drop();
349 let _ = b.txtime();
350 #[cfg(feature = "perf-quic-listener-metrics")]
351 let _ = b.rxtime();
352 let _ = b.gro();
353
354 // We can't determine if this is an IPv4 or IPv6 socket, so try setting
355 // the relevant options for both
356 let _ = b.ip_mtu_discover_probe();
357 let _ = b.ipv6_mtu_discover_probe();
358 if let Ok(true) = b.allows_nonlocal_source() {
359 let _ = b.ipv4_pktinfo();
360 let _ = b.ipv4_recvorigdstaddr();
361 let _ = b.ipv6_pktinfo();
362 let _ = b.ipv6_recvorigdstaddr();
363 }
364 b.finish()
365 }
366}