ab_direct_io_file/
lib.rs

//! Cross-platform APIs for working with files using direct I/O.
//!
//! Depending on the OS, this uses direct, unbuffered or uncached passthrough file reads/writes,
//! bypassing as much of the OS machinery as possible.
//!
//! NOTE: There are major alignment requirements described here:
//! <https://learn.microsoft.com/en-us/windows/win32/fileio/file-buffering#alignment-and-file-access-requirements>
//! <https://man7.org/linux/man-pages/man2/open.2.html>
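//!
//! A minimal usage sketch (the file path and open options below are illustrative, not part of
//! this crate's API):
//!
//! ```no_run
//! # fn main() -> std::io::Result<()> {
//! use ab_direct_io_file::{AlignedPage, DirectIoFile};
//! use std::fs::OpenOptions;
//!
//! let mut options = OpenOptions::new();
//! options.read(true).write(true).create(true);
//! let file = DirectIoFile::open(options, "example.bin")?;
//!
//! // Aligned fast path: whole pages at a page-aligned offset
//! file.write_all_at_raw(&[AlignedPage::default()], 0)?;
//!
//! // Convenience path: arbitrary offset/length, goes through an internal scratch buffer
//! let mut bytes = [0u8; 10];
//! file.read_exact_at(&mut bytes, 3)?;
//! # Ok(())
//! # }
//! ```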

// TODO: Windows shims are incomplete under Miri: https://github.com/rust-lang/miri/issues/3482
#[cfg(all(test, not(all(miri, windows))))]
mod tests;

use parking_lot::Mutex;
use std::fs::{File, OpenOptions};
use std::mem::MaybeUninit;
use std::path::Path;
use std::{io, mem, slice};

/// 4096 is a relatively safe size since the sector size on SSDs is commonly 512 or 4096 bytes
pub const DISK_PAGE_SIZE: usize = 4096;
/// Restrict how much data to read from the disk in a single call to avoid very large memory usage
const MAX_READ_SIZE: usize = 1024 * 1024;

const _: () = {
    assert!(MAX_READ_SIZE.is_multiple_of(AlignedPage::SIZE));
};

/// A wrapper data structure with 4096-byte alignment, which is the most common alignment for
/// direct I/O operations.
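///
/// A short sketch of the conversion helpers this type provides (the buffer size is illustrative):
///
/// ```
/// use ab_direct_io_file::AlignedPage;
///
/// let mut pages = vec![AlignedPage::default(); 2];
/// // View the pages as plain bytes, e.g. to fill them with data
/// AlignedPage::slice_mut_to_repr(&mut pages).as_flattened_mut()[0] = 0xff;
/// let bytes = AlignedPage::slice_to_repr(&pages).as_flattened();
/// assert_eq!(bytes.len(), 2 * AlignedPage::SIZE);
/// assert_eq!(bytes[0], 0xff);
/// ```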
#[derive(Debug, Copy, Clone)]
#[repr(C, align(4096))]
pub struct AlignedPage([u8; AlignedPage::SIZE]);

const _: () = {
    assert!(align_of::<AlignedPage>() == AlignedPage::SIZE);
};

impl Default for AlignedPage {
    #[inline(always)]
    fn default() -> Self {
        Self([0; AlignedPage::SIZE])
    }
}

impl AlignedPage {
    /// 4096 is a relatively safe size since the sector size on SSDs is commonly 512 or 4096
    /// bytes
    pub const SIZE: usize = 4096;

    /// Convert an exclusive slice to an uninitialized version
    pub fn as_uninit_slice_mut(value: &mut [Self]) -> &mut [MaybeUninit<Self>] {
        // SAFETY: Same layout
        unsafe { mem::transmute(value) }
    }

    /// Convenient conversion from slice to underlying representation for efficiency purposes
    #[inline(always)]
    pub fn slice_to_repr(value: &[Self]) -> &[[u8; AlignedPage::SIZE]] {
        // SAFETY: `AlignedPage` is `#[repr(C)]` and guaranteed to have the same memory layout
        unsafe { mem::transmute(value) }
    }

    /// Convenient conversion from slice to underlying representation for efficiency purposes
    #[inline(always)]
    pub fn uninit_slice_to_repr(
        value: &[MaybeUninit<Self>],
    ) -> &[MaybeUninit<[u8; AlignedPage::SIZE]>] {
        // SAFETY: `AlignedPage` is `#[repr(C)]` and guaranteed to have the same memory layout
        unsafe { mem::transmute(value) }
    }

    /// Convenient conversion from a slice of underlying representation for efficiency purposes.
    ///
    /// Returns `None` if not correctly aligned.
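    ///
    /// A short sketch (a representation obtained from an `AlignedPage` slice is always correctly
    /// aligned):
    ///
    /// ```
    /// use ab_direct_io_file::AlignedPage;
    ///
    /// let pages = [AlignedPage::default(); 2];
    /// let repr = AlignedPage::slice_to_repr(&pages);
    /// assert!(AlignedPage::try_slice_from_repr(repr).is_some());
    /// ```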
    #[inline]
    pub fn try_slice_from_repr(value: &[[u8; AlignedPage::SIZE]]) -> Option<&[Self]> {
        // SAFETY: All bit patterns are valid
        let (before, slice, after) = unsafe { value.align_to::<Self>() };

        if before.is_empty() && after.is_empty() {
            Some(slice)
        } else {
            None
        }
    }

    /// Convenient conversion from a slice of underlying representation for efficiency purposes.
    ///
    /// Returns `None` if not correctly aligned.
    #[inline]
    pub fn try_uninit_slice_from_repr(
        value: &[MaybeUninit<[u8; AlignedPage::SIZE]>],
    ) -> Option<&[MaybeUninit<Self>]> {
        // SAFETY: All bit patterns are valid
        let (before, slice, after) = unsafe { value.align_to::<MaybeUninit<Self>>() };

        if before.is_empty() && after.is_empty() {
            Some(slice)
        } else {
            None
        }
    }

    /// Convenient conversion from mutable slice to underlying representation for efficiency
    /// purposes
    #[inline(always)]
    pub fn slice_mut_to_repr(slice: &mut [Self]) -> &mut [[u8; AlignedPage::SIZE]] {
        // SAFETY: `AlignedPage` is `#[repr(C)]` and guaranteed to have the same memory layout
        unsafe { mem::transmute(slice) }
    }

    /// Convenient conversion from mutable slice to underlying representation for efficiency
    /// purposes
    #[inline(always)]
    pub fn uninit_slice_mut_to_repr(
        slice: &mut [MaybeUninit<Self>],
    ) -> &mut [MaybeUninit<[u8; AlignedPage::SIZE]>] {
        // SAFETY: `AlignedPage` is `#[repr(C)]` and guaranteed to have the same memory layout
        unsafe { mem::transmute(slice) }
    }

    /// Convenient conversion from a slice of underlying representation for efficiency purposes.
    ///
    /// Returns `None` if not correctly aligned.
    #[inline]
    pub fn try_slice_mut_from_repr(value: &mut [[u8; AlignedPage::SIZE]]) -> Option<&mut [Self]> {
        // SAFETY: All bit patterns are valid
        let (before, slice, after) = unsafe { value.align_to_mut::<Self>() };

        if before.is_empty() && after.is_empty() {
            Some(slice)
        } else {
            None
        }
    }

    /// Convenient conversion from a slice of underlying representation for efficiency purposes.
    ///
    /// Returns `None` if not correctly aligned.
    #[inline]
    pub fn try_uninit_slice_mut_from_repr(
        value: &mut [MaybeUninit<[u8; AlignedPage::SIZE]>],
    ) -> Option<&mut [MaybeUninit<Self>]> {
        // SAFETY: All bit patterns are valid
        let (before, slice, after) = unsafe { value.align_to_mut::<MaybeUninit<Self>>() };

        if before.is_empty() && after.is_empty() {
            Some(slice)
        } else {
            None
        }
    }
}

/// Wrapper data structure for direct/unbuffered/uncached I/O.
///
/// Depending on the OS, this uses direct, unbuffered or uncached passthrough file reads/writes,
/// bypassing as much of the OS machinery as possible.
///
/// NOTE: There are major alignment requirements described here:
/// <https://learn.microsoft.com/en-us/windows/win32/fileio/file-buffering#alignment-and-file-access-requirements>
/// <https://man7.org/linux/man-pages/man2/open.2.html>
#[derive(Debug)]
pub struct DirectIoFile {
    file: File,
    /// Scratch buffer of aligned memory for reads and writes
    scratch_buffer: Mutex<Vec<AlignedPage>>,
}

impl DirectIoFile {
    /// Open a file with basic open options at the specified path for direct/unbuffered I/O for
    /// reads and writes.
    ///
    /// `options` allows configuring things like read/write/create/truncate, but the flags
    /// required for direct/unbuffered I/O are set internally and will override custom flags.
    ///
    /// Bypassing the system cache is especially important on Windows to prevent huge memory
    /// usage.
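    ///
    /// A minimal sketch (the path is illustrative):
    ///
    /// ```no_run
    /// # fn main() -> std::io::Result<()> {
    /// use ab_direct_io_file::DirectIoFile;
    /// use std::fs::OpenOptions;
    ///
    /// let mut options = OpenOptions::new();
    /// options.read(true).write(true).create(true);
    /// let file = DirectIoFile::open(options, "example.bin")?;
    /// let _size = file.len()?;
    /// # Ok(())
    /// # }
    /// ```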
    #[inline]
    pub fn open<P>(
        #[cfg(any(target_os = "linux", windows))] mut options: OpenOptions,
        #[cfg(not(any(target_os = "linux", windows)))] options: OpenOptions,
        path: P,
    ) -> io::Result<Self>
    where
        P: AsRef<Path>,
    {
        // Direct I/O on Linux
        #[cfg(target_os = "linux")]
        // TODO: Unlock under Miri once supported: https://github.com/rust-lang/miri/issues/4462
        if !cfg!(miri) {
            use std::os::unix::fs::OpenOptionsExt;

            options.custom_flags(libc::O_DIRECT);
        }
        // Unbuffered write-through on Windows
        #[cfg(windows)]
        // TODO: Unlock under Miri once supported: https://github.com/rust-lang/miri/issues/4462
        if !cfg!(miri) {
            use std::os::windows::fs::OpenOptionsExt;

            options.custom_flags(
                windows::Win32::Storage::FileSystem::FILE_FLAG_WRITE_THROUGH.0
                    | windows::Win32::Storage::FileSystem::FILE_FLAG_NO_BUFFERING.0,
            );
        }
        let file = options.open(path)?;

        // Disable caching on macOS
        #[cfg(target_os = "macos")]
        // TODO: Unlock under Miri once supported: https://github.com/rust-lang/miri/issues/4462
        if !cfg!(miri) {
            use std::os::unix::io::AsRawFd;

            // SAFETY: FFI call with correct file descriptor and arguments
            if unsafe { libc::fcntl(file.as_raw_fd(), libc::F_NOCACHE, 1) } != 0 {
                return Err(io::Error::last_os_error());
            }
        }

        Ok(Self {
            file,
            // In many cases, we'll want to read this much at once, so pre-allocate it right away
            scratch_buffer: Mutex::new(vec![
                AlignedPage::default();
                MAX_READ_SIZE / AlignedPage::SIZE
            ]),
        })
    }

    /// Get file size
    #[inline]
    pub fn len(&self) -> io::Result<u64> {
        Ok(self.file.metadata()?.len())
    }

    /// Returns `Ok(true)` if the file is empty
    #[inline]
    pub fn is_empty(&self) -> io::Result<bool> {
        Ok(self.len()? == 0)
    }

    /// Make sure the file has a specified number of bytes allocated on the disk.
    ///
    /// Later writes within `len` will not fail due to lack of disk space.
    #[inline(always)]
    pub fn allocate(&self, len: u64) -> io::Result<()> {
        fs2::FileExt::allocate(&self.file, len)
    }

    /// Truncates or extends the underlying file, updating its size to become `len`.
    ///
    /// Note that if `len` is larger than the previous file size, the result is a sparse file. To
    /// pre-allocate space on disk, use [`Self::allocate()`] instead, optionally followed by this
    /// method to truncate the file if the new size is smaller than the previous one
    /// ([`Self::allocate()`] doesn't truncate the file).
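    ///
    /// A short sketch of pre-allocating and then shrinking a file (sizes are illustrative):
    ///
    /// ```no_run
    /// # fn main() -> std::io::Result<()> {
    /// # use ab_direct_io_file::DirectIoFile;
    /// # let mut options = std::fs::OpenOptions::new();
    /// # options.read(true).write(true).create(true);
    /// # let file = DirectIoFile::open(options, "example.bin")?;
    /// // Reserve space on disk first, then cut the file down to the final size
    /// file.allocate(10 * 1024 * 1024)?;
    /// file.set_len(4096)?;
    /// # Ok(())
    /// # }
    /// ```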
    #[inline(always)]
    pub fn set_len(&self, len: u64) -> io::Result<()> {
        self.file.set_len(len)
    }

    /// Read the exact number of bytes needed to fill `buf` at `offset`.
    ///
    /// NOTE: This uses locking and buffering internally, prefer [`Self::read_exact_at_raw()`] if
    /// you can control data alignment.
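    ///
    /// A short sketch of an unaligned read (the offset and length are illustrative):
    ///
    /// ```no_run
    /// # fn main() -> std::io::Result<()> {
    /// # use ab_direct_io_file::DirectIoFile;
    /// # let mut options = std::fs::OpenOptions::new();
    /// # options.read(true);
    /// # let file = DirectIoFile::open(options, "example.bin")?;
    /// // Neither the offset nor the buffer length has to be page-aligned
    /// let mut buf = [0u8; 100];
    /// file.read_exact_at(&mut buf, 1234)?;
    /// # Ok(())
    /// # }
    /// ```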
    pub fn read_exact_at(&self, buf: &mut [u8], mut offset: u64) -> io::Result<()> {
        if buf.is_empty() {
            return Ok(());
        }

        let mut scratch_buffer = self.scratch_buffer.lock();

        // The constructor pre-allocates the scratch buffer with exactly this many bytes, and the
        // chunking below relies on it
        debug_assert_eq!(
            AlignedPage::slice_to_repr(&scratch_buffer)
                .as_flattened()
                .len(),
            MAX_READ_SIZE
        );

        // First read up to `MAX_READ_SIZE - padding`
        let padding = (offset % AlignedPage::SIZE as u64) as usize;
        let first_unaligned_chunk_size = (MAX_READ_SIZE - padding).min(buf.len());
        let (unaligned_start, buf) = buf.split_at_mut(first_unaligned_chunk_size);
        {
            let bytes_to_read = unaligned_start.len();
            unaligned_start.copy_from_slice(self.read_exact_at_internal(
                &mut scratch_buffer,
                bytes_to_read,
                offset,
            )?);
            offset += unaligned_start.len() as u64;
        }

        if buf.is_empty() {
            return Ok(());
        }

        // Process the rest of the chunks, up to `MAX_READ_SIZE` at a time
        for buf in buf.chunks_mut(MAX_READ_SIZE) {
            let bytes_to_read = buf.len();
            buf.copy_from_slice(self.read_exact_at_internal(
                &mut scratch_buffer,
                bytes_to_read,
                offset,
            )?);
            offset += buf.len() as u64;
        }

        Ok(())
    }

    /// Write all bytes from `buf` at `offset`.
    ///
    /// NOTE: This uses locking and buffering internally, prefer [`Self::write_all_at_raw()`] if
    /// you can control data alignment.
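    ///
    /// A short sketch of an unaligned write (the offset and contents are illustrative):
    ///
    /// ```no_run
    /// # fn main() -> std::io::Result<()> {
    /// # use ab_direct_io_file::{AlignedPage, DirectIoFile};
    /// # let mut options = std::fs::OpenOptions::new();
    /// # options.read(true).write(true).create(true);
    /// # let file = DirectIoFile::open(options, "example.bin")?;
    /// // Unaligned writes are read-modify-write operations on whole pages internally, so the
    /// // pages being touched need to exist first
    /// file.set_len(2 * AlignedPage::SIZE as u64)?;
    /// file.write_all_at(b"hello", 1234)?;
    /// # Ok(())
    /// # }
    /// ```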
    pub fn write_all_at(&self, buf: &[u8], mut offset: u64) -> io::Result<()> {
        if buf.is_empty() {
            return Ok(());
        }

        let mut scratch_buffer = self.scratch_buffer.lock();

        // The constructor pre-allocates the scratch buffer with exactly this many bytes, and the
        // chunking below relies on it
        debug_assert_eq!(
            AlignedPage::slice_to_repr(&scratch_buffer)
                .as_flattened()
                .len(),
            MAX_READ_SIZE
        );

        // First, write up to `MAX_READ_SIZE - padding`
        let padding = (offset % AlignedPage::SIZE as u64) as usize;
        let first_unaligned_chunk_size = (MAX_READ_SIZE - padding).min(buf.len());
        let (unaligned_start, buf) = buf.split_at(first_unaligned_chunk_size);
        {
            self.write_all_at_internal(&mut scratch_buffer, unaligned_start, offset)?;
            offset += unaligned_start.len() as u64;
        }

        if buf.is_empty() {
            return Ok(());
        }

        // Process the rest of the chunks, up to `MAX_READ_SIZE` at a time
        for buf in buf.chunks(MAX_READ_SIZE) {
            self.write_all_at_internal(&mut scratch_buffer, buf, offset)?;
            offset += buf.len() as u64;
        }

        Ok(())
    }

    /// Low-level reading into aligned memory.
    ///
    /// `offset` needs to be page-aligned too; otherwise use [`Self::read_exact_at()`] if you're
    /// willing to pay for the corresponding overhead.
    ///
    /// A successful result guarantees that the whole `buf` was filled.
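    ///
    /// A short sketch using an uninitialized page buffer (the offset is illustrative, but note
    /// that it must be a multiple of [`AlignedPage::SIZE`]):
    ///
    /// ```no_run
    /// # fn main() -> std::io::Result<()> {
    /// use ab_direct_io_file::{AlignedPage, DirectIoFile};
    /// use std::mem::MaybeUninit;
    /// # let mut options = std::fs::OpenOptions::new();
    /// # options.read(true);
    /// # let file = DirectIoFile::open(options, "example.bin")?;
    /// let mut pages = [MaybeUninit::<AlignedPage>::uninit(); 2];
    /// file.read_exact_at_raw(&mut pages, 0)?;
    /// # Ok(())
    /// # }
    /// ```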
    #[inline]
    pub fn read_exact_at_raw(
        &self,
        buf: &mut [MaybeUninit<AlignedPage>],
        offset: u64,
    ) -> io::Result<()> {
        let buf = AlignedPage::uninit_slice_mut_to_repr(buf);

        // TODO: Switch to APIs from https://github.com/rust-lang/rust/issues/140771 once
        //  implementation lands in nightly
        // SAFETY: `buf` is never read by Rust internal API, only written to
        let buf = unsafe {
            slice::from_raw_parts_mut(
                buf.as_mut_ptr().cast::<[u8; AlignedPage::SIZE]>(),
                buf.len(),
            )
        };

        let buf = buf.as_flattened_mut();

        #[cfg(unix)]
        {
            use std::os::unix::fs::FileExt;

            self.file.read_exact_at(buf, offset)
        }
        #[cfg(windows)]
        {
            use std::os::windows::fs::FileExt;

            let mut buf = buf;
            let mut offset = offset;
            while !buf.is_empty() {
                match self.file.seek_read(buf, offset) {
                    Ok(0) => {
                        break;
                    }
                    Ok(n) => {
                        buf = &mut buf[n..];
                        offset += n as u64;
                    }
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {
                        // Try again
                    }
                    Err(e) => {
                        return Err(e);
                    }
                }
            }

            if !buf.is_empty() {
                Err(io::Error::new(
                    io::ErrorKind::UnexpectedEof,
                    "failed to fill the whole buffer",
                ))
            } else {
                Ok(())
            }
        }
    }

    /// Low-level writing from aligned memory.
    ///
    /// `offset` needs to be page-aligned too; otherwise use [`Self::write_all_at()`] if you're
    /// willing to pay for the corresponding overhead.
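    ///
    /// A short sketch of a page-aligned write (the offset is illustrative, but note that it must
    /// be a multiple of [`AlignedPage::SIZE`]):
    ///
    /// ```no_run
    /// # fn main() -> std::io::Result<()> {
    /// use ab_direct_io_file::{AlignedPage, DirectIoFile};
    /// # let mut options = std::fs::OpenOptions::new();
    /// # options.write(true).create(true);
    /// # let file = DirectIoFile::open(options, "example.bin")?;
    /// let pages = [AlignedPage::default(); 2];
    /// file.write_all_at_raw(&pages, AlignedPage::SIZE as u64)?;
    /// # Ok(())
    /// # }
    /// ```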
    #[inline]
    pub fn write_all_at_raw(&self, buf: &[AlignedPage], offset: u64) -> io::Result<()> {
        let buf = AlignedPage::slice_to_repr(buf).as_flattened();

        #[cfg(unix)]
        {
            use std::os::unix::fs::FileExt;

            self.file.write_all_at(buf, offset)
        }
        #[cfg(windows)]
        {
            use std::os::windows::fs::FileExt;

            let mut buf = buf;
            let mut offset = offset;
            while !buf.is_empty() {
                match self.file.seek_write(buf, offset) {
                    Ok(0) => {
                        return Err(io::Error::new(
                            io::ErrorKind::WriteZero,
                            "failed to write the whole buffer",
                        ));
                    }
                    Ok(n) => {
                        buf = &buf[n..];
                        offset += n as u64;
                    }
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {
                        // Try again
                    }
                    Err(e) => {
                        return Err(e);
                    }
                }
            }

            Ok(())
        }
    }

    /// Access internal [`File`] instance
    #[inline(always)]
    pub fn file(&self) -> &File {
        &self.file
    }

    fn read_exact_at_internal<'a>(
        &self,
        scratch_buffer: &'a mut [AlignedPage],
        bytes_to_read: usize,
        offset: u64,
    ) -> io::Result<&'a [u8]> {
        let page_aligned_offset = offset / AlignedPage::SIZE as u64 * AlignedPage::SIZE as u64;
        let padding = (offset - page_aligned_offset) as usize;

        // Make a scratch buffer of a size that is necessary to read aligned memory, accounting
        // for extra bytes at the beginning and the end that will be thrown away
        let pages_to_read = (padding + bytes_to_read).div_ceil(AlignedPage::SIZE);
        let scratch_buffer = &mut scratch_buffer[..pages_to_read];

        self.read_exact_at_raw(
            AlignedPage::as_uninit_slice_mut(scratch_buffer),
            page_aligned_offset,
        )?;

        Ok(&AlignedPage::slice_to_repr(scratch_buffer).as_flattened()[padding..][..bytes_to_read])
    }

    /// Panics on writes over `MAX_READ_SIZE` (including padding on both ends)
    fn write_all_at_internal(
        &self,
        scratch_buffer: &mut [AlignedPage],
        bytes_to_write: &[u8],
        offset: u64,
    ) -> io::Result<()> {
        let page_aligned_offset = offset / AlignedPage::SIZE as u64 * AlignedPage::SIZE as u64;
        let padding = (offset - page_aligned_offset) as usize;

        // Calculate the size of the read including padding on both ends
        let pages_to_read = (padding + bytes_to_write.len()).div_ceil(AlignedPage::SIZE);

        if padding == 0 && bytes_to_write.len().is_multiple_of(AlignedPage::SIZE) {
            // Fast path: the write covers whole pages exactly, copy into the scratch buffer and
            // write it out directly
            let scratch_buffer = &mut scratch_buffer[..pages_to_read];
            AlignedPage::slice_mut_to_repr(scratch_buffer)
                .as_flattened_mut()
                .copy_from_slice(bytes_to_write);
            self.write_all_at_raw(scratch_buffer, offset)?;
        } else {
            let scratch_buffer = &mut scratch_buffer[..pages_to_read];
            // Read whole pages where `bytes_to_write` will be written
            self.read_exact_at_raw(
                AlignedPage::as_uninit_slice_mut(scratch_buffer),
                page_aligned_offset,
            )?;
            // Update the contents of existing pages and write into the file
            AlignedPage::slice_mut_to_repr(scratch_buffer).as_flattened_mut()[padding..]
                [..bytes_to_write.len()]
                .copy_from_slice(bytes_to_write);
            self.write_all_at_raw(scratch_buffer, page_aligned_offset)?;
        }

        Ok(())
    }
}