Skip to main content

ab_riscv_interpreter/v/zvexx/load/
zvexx_load_helpers.rs

1//! Opaque helpers for ZveXx extension
2
3use crate::v::vector_registers::{VectorRegisterFile, VectorRegistersExt};
4use crate::v::zvexx::zvexx_helpers::INSTRUCTION_SIZE;
5use crate::{ExecutionError, ProgramCounter, VirtualMemory, VirtualMemoryError};
6use ab_riscv_primitives::prelude::*;
7use core::cmp::Ordering;
8use core::fmt;
9
10/// Return whether mask bit `i` is set in the mask byte slice.
11///
12/// Bits are stored LSB-first within each byte: bit `i` is at byte `i / 8`, position `i % 8`.
13/// Returns `false` for any `i` outside the slice bounds.
14#[inline(always)]
15pub(in super::super) fn mask_bit(mask: &[u8], i: u32) -> bool {
16    mask.get((i / u8::BITS) as usize)
17        .is_some_and(|b| (b >> (i % u8::BITS)) & 1 != 0)
18}
19
20/// Copy the mask bytes needed to cover `vl` elements from `v0` into a stack buffer and return
21/// it. The copy releases the shared borrow on the register file so the caller can immediately
22/// take an exclusive borrow for writes.
23///
24/// When `vm=true` (unmasked), the buffer is filled with `0xff` so that every mask bit reads as `1`.
25/// This means callers can unconditionally call [`mask_bit()`] on the returned buffer without
26/// branching on `vm`. Current callers short-circuit with `!vm &&` before calling [`mask_bit()`] as
27/// a micro-optimization on the common unmasked path, but correctness does not depend on that guard:
28/// if it were removed, the `0xff` fill ensures [`mask_bit()`] would return `true` for every
29/// element, preserving the unmasked semantics.
30///
31/// # Safety
32/// `vl.div_ceil(8)` must be `<= VLENB`. This holds when `vl <= VLEN`, which is always true
33/// when `vl` is the current architectural `vl` (bounded by `VLMAX <= VLEN`).
34#[inline(always)]
35pub(in super::super) unsafe fn snapshot_mask<const VLENB: usize>(
36    vregs: &VectorRegisterFile<VLENB>,
37    vm: bool,
38    vl: u32,
39) -> [u8; VLENB] {
40    let mut buf = [0u8; VLENB];
41    if vm {
42        // All-ones: every element active
43        buf = [0xffu8; VLENB];
44    } else {
45        let mask_bytes = vl.div_ceil(u8::BITS) as usize;
46        // SAFETY: `mask_bytes <= VLENB` by the caller's precondition
47        unsafe {
48            buf.get_unchecked_mut(..mask_bytes)
49                .copy_from_slice(vregs.get(VReg::V0).get_unchecked(..mask_bytes));
50        }
51    }
52    buf
53}
54
55/// Return whether register groups `[a, a+a_regs)` and `[b, b+b_regs)` overlap.
56#[inline(always)]
57#[doc(hidden)]
58pub fn groups_overlap(a: VReg, a_regs: u8, b: VReg, b_regs: u8) -> bool {
59    let (a, b) = (a.to_bits(), b.to_bits());
60    a < b + b_regs && b < a + a_regs
61}
62
63/// Return whether a *non-segment* indexed load's data destination group
64/// `[vd, vd + data_regs)` may legally overlap its index source group `[vs2, vs2 + index_regs)`.
65///
66/// The data EEW equals `sew` (indexed loads take their data width from `vtype.vsew()`) with
67/// `EMUL = LMUL`, whereas the index group has EEW `index_eew` and `EMUL = (index_eew / sew) *
68/// LMUL`. Because the two groups can have different EEW, the general vector register overlap
69/// constraint applies: a destination group may overlap a source group only when one of the
70/// following holds:
71///
72/// - the EEWs are equal (the groups coincide); or
73/// - the destination EEW is smaller and the overlap is in the lowest-numbered part of the source
74///   group, i.e. the destination starts at the source's base register (`vd == vs2`); or
75/// - the destination EEW is larger, the source EMUL is at least one register, and the overlap is in
76///   the highest-numbered part of the destination group, i.e. both groups end at the same register
77///   (`vd + data_regs == vs2 + index_regs`).
78///
79/// Groups that do not overlap at all are always permitted. Any other overlap is reserved.
80///
81/// Unlike indexed *segment* loads (which forbid any `vd`/`vs2` overlap to remain restartable),
82/// these relaxed rules are what allow encodings such as `vluxei32.v v16, (s2), v16` when the data
83/// and index EEW match.
84#[inline(always)]
85#[doc(hidden)]
86pub fn indexed_load_overlap_allowed(
87    vd: VReg,
88    data_regs: u8,
89    vs2: VReg,
90    index_regs: u8,
91    index_eew: Eew,
92    sew: Vsew,
93    vlmul: Vlmul,
94) -> bool {
95    if !groups_overlap(vd, data_regs, vs2, index_regs) {
96        return true;
97    }
98
99    match sew.bytes_width().cmp(&index_eew.bytes_width()) {
100        // Equal EEW: the two groups coincide, overlap is permitted.
101        Ordering::Equal => true,
102        // Smaller data EEW: overlap must be in the lowest-numbered part of the index group, which
103        // (given both groups are alignment-checked) means the data group starts at the index base.
104        Ordering::Less => vd == vs2,
105        // Larger data EEW: overlap must be in the highest-numbered part of the data group, and the
106        // index EMUL must be at least one full register. `index_regs` alone cannot distinguish a
107        // whole-register EMUL from a fractional one clamped to a single register, so the EMUL is
108        // recomputed here as `(index_eew / sew) * LMUL >= 1`.
109        Ordering::Greater => {
110            let (lmul_num, lmul_den) = vlmul.as_fraction();
111            let index_emul_at_least_one = u16::from(index_eew.bits_width()) * u16::from(lmul_num)
112                >= u16::from(sew.bits_width()) * u16::from(lmul_den);
113            let (vd, vs2) = (vd.to_bits(), vs2.to_bits());
114            index_emul_at_least_one && vd + data_regs == vs2 + index_regs
115        }
116    }
117}
118
119/// Check that `vd` is aligned to `group_regs` and that the group fits within `[0, 32)`.
120///
121/// Per spec, the base register of every register group must be a multiple of the group size.
122#[inline(always)]
123#[doc(hidden)]
124pub fn check_register_group_alignment<Reg, Memory, PC, CustomError>(
125    program_counter: &PC,
126    vd: VReg,
127    group_regs: u8,
128) -> Result<(), ExecutionError<Reg::Type, CustomError>>
129where
130    Reg: Register,
131    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
132{
133    let vd = vd.to_bits();
134    if !vd.is_multiple_of(group_regs) || vd + group_regs > 32 {
135        return Err(ExecutionError::IllegalInstruction {
136            address: program_counter.old_pc(INSTRUCTION_SIZE),
137        });
138    }
139    Ok(())
140}
141
142/// Validate segment register layout: all `nf` field groups fit within `[0, 32)`, the base
143/// register is group-aligned, and the first field group does not include `v0` when masked.
144///
145/// Field `f` occupies registers `[vd + f * group_regs, vd + f * group_regs + group_regs)`.
146/// On `Ok`, `vd.to_bits() + nf * group_regs <= 32` is guaranteed.
147#[inline(always)]
148#[doc(hidden)]
149pub fn validate_segment_registers<Reg, Memory, PC, CustomError>(
150    program_counter: &PC,
151    vd: VReg,
152    vm: bool,
153    group_regs: u8,
154    nf: Nf,
155) -> Result<(), ExecutionError<Reg::Type, CustomError>>
156where
157    Reg: Register,
158    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
159{
160    let group_regs = u32::from(group_regs);
161    let nf = u32::from(nf.fields_per_segment());
162    let vd_idx = u32::from(vd.to_bits());
163    if vd_idx % group_regs != 0 || vd_idx + nf * group_regs > 32 {
164        return Err(ExecutionError::IllegalInstruction {
165            address: program_counter.old_pc(INSTRUCTION_SIZE),
166        });
167    }
168    // When masked, no field group may contain v0 (index 0). Since groups are laid out
169    // contiguously from vd and vd is group-aligned, only the first field (f=0) could contain
170    // v0, which happens exactly when vd == 0.
171    if !vm && vd_idx == 0 {
172        return Err(ExecutionError::IllegalInstruction {
173            address: program_counter.old_pc(INSTRUCTION_SIZE),
174        });
175    }
176    Ok(())
177}
178
179/// Read element `elem_i` from register group `[base_reg, base_reg + group_regs)` into a
180/// `[u8; Eew::MAX_BYTES]` buffer.
181///
182/// The in-register position of element `elem_i` is:
183///   - register `base_reg + elem_i / (VLENB / eew.bytes())`
184///   - byte offset `(elem_i % (VLENB / eew.bytes())) * eew.bytes()`
185///
186/// The result is placed in `buf[..eew.bytes()]`; the remaining bytes are zero.
187///
188/// # Safety
189/// `base_reg + elem_i / (VLENB / eew.bytes())` must be less than 32, i.e. `elem_i` must be
190/// a valid element index within the register group.
191#[inline(always)]
192pub(in super::super) unsafe fn read_group_element<const VLENB: usize>(
193    vregs: &VectorRegisterFile<VLENB>,
194    base_reg: VReg,
195    // TODO: `elem_i` here and in other places shouldn't be `u32`
196    elem_i: u32,
197    eew: Eew,
198) -> [u8; Eew::MAX_BYTES as usize] {
199    let elem_bytes = usize::from(eew.bytes_width());
200    let elems_per_reg = VLENB / elem_bytes;
201    let reg_off = elem_i as usize / elems_per_reg;
202    let byte_off = (elem_i as usize % elems_per_reg) * elem_bytes;
203    // SAFETY: `base_reg + reg_off < 32` by the caller's precondition
204    let reg = unsafe {
205        vregs.get(VReg::from_bits(base_reg.to_bits() + reg_off as u8).unwrap_unchecked())
206    };
207    // SAFETY: `byte_off + elem_bytes <= VLENB`: the maximum `byte_off` is
208    // `(elems_per_reg - 1) * elem_bytes = VLENB - elem_bytes`, so
209    // `byte_off + elem_bytes <= VLENB - elem_bytes + elem_bytes = VLENB`.
210    // `elem_bytes <= Eew::MAX_BYTES`: all `Eew` variants are at most E64.
211    let src = unsafe { reg.get_unchecked(byte_off..byte_off + elem_bytes) };
212    let mut buf = [0; _];
213    // SAFETY: `elem_bytes <= Eew::MAX_BYTES` as established above, so `..elem_bytes` is in bounds
214    // for `buf`
215    unsafe { buf.get_unchecked_mut(..elem_bytes) }.copy_from_slice(src);
216    buf
217}
218
219/// Write `eew`-sized data from `buf[..eew.bytes()]` into element `elem_i` of register group
220/// `[base_reg, base_reg + group_regs)`.
221///
222/// The in-register position follows the same layout as [`read_group_element`].
223///
224/// # Safety
225/// `base_reg + elem_i / (VLENB / eew.bytes())` must be less than 32, i.e. `elem_i` must be
226/// a valid element index within the register group.
227#[inline(always)]
228unsafe fn write_group_element<const VLENB: usize>(
229    vregs: &mut VectorRegisterFile<VLENB>,
230    base_reg: VReg,
231    elem_i: u32,
232    eew: Eew,
233    buf: [u8; Eew::MAX_BYTES as usize],
234) {
235    let elem_bytes = usize::from(eew.bytes_width());
236    let elems_per_reg = VLENB / elem_bytes;
237    let reg_off = elem_i as usize / elems_per_reg;
238    let byte_off = (elem_i as usize % elems_per_reg) * elem_bytes;
239    // SAFETY: `base_reg + reg_off < 32` by the caller's precondition
240    let reg = unsafe {
241        vregs.get_mut(VReg::from_bits(base_reg.to_bits() + reg_off as u8).unwrap_unchecked())
242    };
243    // SAFETY: `byte_off + elem_bytes <= VLENB` and `elem_bytes <= Eew::MAX_BYTES`: same argument as
244    // in `read_group_element`
245    let dst = unsafe { reg.get_unchecked_mut(byte_off..byte_off + elem_bytes) };
246    // SAFETY: `elem_bytes <= Eew::MAX_BYTES` as established above, so `..elem_bytes` is in bounds
247    // for `buf`
248    dst.copy_from_slice(unsafe { buf.get_unchecked(..elem_bytes) });
249}
250
251/// Read `eew`-sized data from memory at `addr` into a `[u8; Eew::MAX_BYTES]` buffer
252/// (little-endian)
253#[inline(always)]
254fn read_mem_element(
255    memory: &impl VirtualMemory,
256    addr: u64,
257    eew: Eew,
258) -> Result<[u8; Eew::MAX_BYTES as usize], VirtualMemoryError> {
259    let mut out = [0; _];
260    out[..usize::from(eew.bytes_width())]
261        .copy_from_slice(memory.read_slice(addr, u32::from(eew.bytes_width()))?);
262    Ok(out)
263}
264
265/// Execute a unit-stride or unit-stride segment load (including fault-only-first variants).
266///
267/// Segment stride between elements is `nf * eew.bytes()`. Field `f` for element `i` is at
268/// `base + i * nf * eew.bytes() + f * eew.bytes()`. When `nf == 1` this degenerates to a
269/// plain unit-stride load.
270///
271/// When `fault_only_first` is set: a memory error at element `i > 0` truncates `vl` to `i`
272/// and returns `Ok`. An error at element `0` always propagates.
273///
274/// # Safety
275/// - `vd.to_bits() % group_regs == 0`
276/// - `vd.to_bits() + nf * group_regs <= 32`
277/// - `vl <= group_regs * VLENB / eew.bytes()` (all `vl` elements fit within the destination
278///   register group; this holds when `vl` is the architectural `vl` and `group_regs` is the EMUL
279///   register count for the given `eew` and `vtype`)
280/// - When `vm=false`: `vd` does not overlap `v0` (i.e. `vd.to_bits() != 0`)
281#[inline(always)]
282#[expect(clippy::too_many_arguments, reason = "Internal API")]
283#[doc(hidden)]
284pub unsafe fn execute_unit_stride_load<Reg, ExtState, Memory, CustomError>(
285    ext_state: &mut ExtState,
286    memory: &Memory,
287    vd: VReg,
288    vm: bool,
289    base: u64,
290    eew: Eew,
291    group_regs: u8,
292    nf: Nf,
293    fault_only_first: bool,
294) -> Result<(), ExecutionError<Reg::Type, CustomError>>
295where
296    Reg: Register,
297    ExtState: VectorRegistersExt<Reg, CustomError>,
298    [(); ExtState::ELEN as usize]:,
299    [(); ExtState::VLEN as usize]:,
300    [(); ExtState::VLENB as usize]:,
301    Memory: VirtualMemory,
302    CustomError: fmt::Debug,
303{
304    let vl = ext_state.vl();
305    let vstart = ext_state.vstart();
306    let elem_bytes = eew.bytes_width();
307    let segment_stride = u64::from(nf.fields_per_segment()) * u64::from(elem_bytes);
308
309    // SAFETY: `vl <= VLMAX <= VLEN`, so `vl.div_ceil(8) <= VLENB`.
310    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
311
312    for i in u32::from(vstart)..vl {
313        if !vm && !mask_bit(&mask_buf, i) {
314            continue;
315        }
316
317        let elem_base = base.wrapping_add(u64::from(i) * segment_stride);
318
319        // Read all nf fields into a stack buffer before writing any of them.
320        // This ensures a fault on field f>0 leaves the destination registers untouched for the
321        // faulting element, so only elements with index new_vl are ever written (fault-only-first
322        // semantics).
323        //
324        // Sized by `Nf::MAX * Eew::MAX_BYTES`: the V spec allows at most 8 fields (nf in 1..=8)
325        // each is at most 8 bytes (E64), giving 64 bytes.
326        let mut field_buf =
327            [[0u8; usize::from(Eew::MAX_BYTES)]; usize::from(Nf::MAX.fields_per_segment())];
328
329        for f in 0..nf.fields_per_segment() {
330            let addr = elem_base.wrapping_add(u64::from(f * elem_bytes));
331            match read_mem_element(memory, addr, eew) {
332                Ok(data) => {
333                    // SAFETY: `f < nf` and the precondition on this function requires
334                    // `nf <= Nf::MAX` (the V spec encodes nf in 3 bits giving 1..=Nf::MAX, and the
335                    // decoder enforces this before constructing the instruction). Therefore, `f as
336                    // usize < nf as usize <= Nf::MAX`, which is exactly the length of `field_buf`.
337                    unsafe {
338                        *field_buf.get_unchecked_mut(f as usize) = data;
339                    }
340                }
341                Err(mem_err) => {
342                    if fault_only_first && i > 0 {
343                        ext_state.set_vl(i);
344                        ext_state.mark_vs_dirty();
345                        ext_state.reset_vstart();
346                        return Ok(());
347                    }
348                    if i > u32::from(vstart) {
349                        // Elements [vstart, i) were committed; VS is now dirty.
350                        ext_state.mark_vs_dirty();
351                        // vstart records the faulting element for restartability.
352                        ext_state.set_vstart(i as u16);
353                    }
354                    return Err(ExecutionError::MemoryAccess(mem_err));
355                }
356            }
357        }
358
359        // All nf fields for element i were read successfully; commit to the register file.
360        for f in 0..nf.fields_per_segment() {
361            // SAFETY: Guaranteed by function contract
362            let field_base_reg =
363                unsafe { VReg::from_bits(vd.to_bits() + f * group_regs).unwrap_unchecked() };
364            // SAFETY: need `field_base_reg + i / (VLENB / elem_bytes) < 32`.
365            //
366            // Let `elems_per_reg = VLENB / elem_bytes`.
367            // `i < vl <= group_regs * elems_per_reg` (precondition), so
368            // `i / elems_per_reg < group_regs`.
369            //
370            // `field_base_reg = vd.to_bits() + f * group_regs`. Since `f < nf` and the
371            // precondition guarantees `vd.to_bits() + nf * group_regs <= 32`:
372            // `field_base_reg + group_regs <= vd.to_bits() + (f+1) * group_regs
373            //                             <= vd.to_bits() + nf * group_regs <= 32`.
374            //
375            // Therefore, `field_base_reg + i / elems_per_reg
376            //            < field_base_reg + group_regs <= 32`.
377            //
378            // For `field_buf`: `f < nf <= Nf::MAX` (the same argument as in the read loop
379            // above), so `f as usize < Nf::MAX = field_buf.len()`.
380            unsafe {
381                write_group_element(
382                    ext_state.write_vregs(),
383                    field_base_reg,
384                    i,
385                    eew,
386                    *field_buf.get_unchecked(f as usize),
387                );
388            }
389        }
390    }
391
392    ext_state.mark_vs_dirty();
393    ext_state.reset_vstart();
394    Ok(())
395}
396
397/// Execute a strided or strided segment load.
398///
399/// `addr[i] = base + i * stride` where `stride` is a signed XLEN-wide value. Field `f` of
400/// element `i` is at `addr[i] + f * eew.bytes()`.
401///
402/// # Safety
403/// - `vd.to_bits() % group_regs == 0`
404/// - `vd.to_bits() + nf * group_regs <= 32`
405/// - `vl <= group_regs * VLENB / eew.bytes()`
406/// - When `vm=false`: `vd` does not overlap `v0` (i.e. `vd.to_bits() != 0`)
407#[inline(always)]
408#[expect(clippy::too_many_arguments, reason = "Internal API")]
409#[doc(hidden)]
410pub unsafe fn execute_strided_load<Reg, ExtState, Memory, CustomError>(
411    ext_state: &mut ExtState,
412    memory: &Memory,
413    vd: VReg,
414    vm: bool,
415    base: u64,
416    stride: i64,
417    eew: Eew,
418    group_regs: u8,
419    nf: Nf,
420) -> Result<(), ExecutionError<Reg::Type, CustomError>>
421where
422    Reg: Register,
423    ExtState: VectorRegistersExt<Reg, CustomError>,
424    [(); ExtState::ELEN as usize]:,
425    [(); ExtState::VLEN as usize]:,
426    [(); ExtState::VLENB as usize]:,
427    Memory: VirtualMemory,
428    CustomError: fmt::Debug,
429{
430    let vl = ext_state.vl();
431    let vstart = ext_state.vstart();
432    let elem_bytes = eew.bytes_width();
433
434    // SAFETY: `vl <= VLMAX <= VLEN` (precondition), so `vl.div_ceil(8) <= VLEN / 8 = VLENB`.
435    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
436
437    for i in u32::from(vstart)..vl {
438        if !vm && !mask_bit(&mask_buf, i) {
439            continue;
440        }
441
442        let elem_base = base.wrapping_add(i64::from(i).wrapping_mul(stride).cast_unsigned());
443
444        for f in 0..nf.fields_per_segment() {
445            let addr = elem_base.wrapping_add(u64::from(f * elem_bytes));
446            let data = match read_mem_element(memory, addr, eew) {
447                Ok(data) => data,
448                Err(mem_err) => {
449                    if f > 0 || i > u32::from(vstart) {
450                        ext_state.mark_vs_dirty();
451                        ext_state.set_vstart(i as u16);
452                    }
453                    return Err(ExecutionError::MemoryAccess(mem_err));
454                }
455            };
456            // SAFETY: Guaranteed by function contract
457            let field_base_reg =
458                unsafe { VReg::from_bits(vd.to_bits() + f * group_regs).unwrap_unchecked() };
459            // SAFETY: need `field_base_reg + i / (VLENB / elem_bytes) < 32`.
460            //
461            // Let `elems_per_reg = VLENB / elem_bytes`.
462            // `i < vl <= group_regs * elems_per_reg` (precondition), so
463            // `i / elems_per_reg < group_regs`.
464            //
465            // `field_base_reg = vd.to_bits() + f * group_regs`. Since `f < nf` and
466            // `vd.to_bits() + nf * group_regs <= 32` (precondition):
467            // `field_base_reg + group_regs <= vd.to_bits() + (f+1) * group_regs
468            //                             <= vd.to_bits() + nf * group_regs <= 32`.
469            //
470            // Therefore, `field_base_reg + i / elems_per_reg < field_base_reg + group_regs <= 32`.
471            unsafe {
472                write_group_element(ext_state.write_vregs(), field_base_reg, i, eew, data);
473            }
474        }
475    }
476
477    ext_state.mark_vs_dirty();
478    ext_state.reset_vstart();
479    Ok(())
480}
481
482/// Execute an indexed (unordered or ordered) or indexed segment load.
483///
484/// For element `i`, reads `index_eew`-sized bytes from register group `vs2` at element `i`
485/// to obtain a zero-extended byte offset, then loads `nf` data fields from
486/// `base + offset + f * data_eew.bytes()`. Unordered vs ordered is functionally identical in
487/// a software interpreter.
488///
489/// # Safety
490/// - `vd.to_bits() % data_group_regs == 0`
491/// - `vd.to_bits() + nf * data_group_regs <= 32`
492/// - `vs2.to_bits() + (vl - 1) / (VLENB / index_eew.bytes()) < 32` (all `vl` index elements fit
493///   within the register file; satisfied when `vs2` is alignment-checked against `EMUL_index` and
494///   `vl` is the architectural `vl` bounded by `VLMAX`)
495/// - `vl <= data_group_regs * VLENB / data_eew.bytes()` (all `vl` elements fit in a data group)
496/// - When `vm=false`: `vd` does not overlap `v0` (i.e. `vd.to_bits() != 0`)
497#[inline(always)]
498#[expect(clippy::too_many_arguments, reason = "Internal API")]
499#[doc(hidden)]
500pub unsafe fn execute_indexed_load<Reg, ExtState, Memory, CustomError>(
501    ext_state: &mut ExtState,
502    memory: &Memory,
503    vd: VReg,
504    vs2: VReg,
505    vm: bool,
506    base: u64,
507    data_eew: Eew,
508    index_eew: Eew,
509    data_group_regs: u8,
510    nf: Nf,
511) -> Result<(), ExecutionError<Reg::Type, CustomError>>
512where
513    Reg: Register,
514    ExtState: VectorRegistersExt<Reg, CustomError>,
515    [(); ExtState::ELEN as usize]:,
516    [(); ExtState::VLEN as usize]:,
517    [(); ExtState::VLENB as usize]:,
518    Memory: VirtualMemory,
519    CustomError: fmt::Debug,
520{
521    let vl = ext_state.vl();
522    let vstart = ext_state.vstart();
523    let index_base_reg = vs2;
524
525    // SAFETY: `vl <= VLMAX <= VLEN` (precondition), so `vl.div_ceil(8) <= VLEN / 8 = VLENB`.
526    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
527
528    for i in u32::from(vstart)..vl {
529        if !vm && !mask_bit(&mask_buf, i) {
530            continue;
531        }
532
533        // SAFETY: need `index_base_reg + i / (VLENB / index_eew.bytes()) < 32`.
534        //
535        // The caller verified `vs2` is aligned to `EMUL_index` registers and that
536        // `vs2.to_bits() + EMUL_index <= 32`. `EMUL_index` is defined so that
537        // `EMUL_index * (VLENB / index_eew.bytes()) = VLMAX`. Since `i < vl <= VLMAX`,
538        // `i / (VLENB / index_eew.bytes()) < EMUL_index`, and therefore
539        // `index_base_reg + i / (VLENB / index_eew.bytes()) < index_base_reg + EMUL_index <= 32`.
540        let index_buf =
541            unsafe { read_group_element(ext_state.read_vregs(), index_base_reg, i, index_eew) };
542        let offset = u64::from_le_bytes(index_buf);
543        let elem_addr = base.wrapping_add(offset);
544
545        let data_elem_bytes = data_eew.bytes_width();
546        for f in 0..nf.fields_per_segment() {
547            let addr = elem_addr.wrapping_add(u64::from(f) * u64::from(data_elem_bytes));
548            let data = match read_mem_element(memory, addr, data_eew) {
549                Ok(data) => data,
550                Err(mem_err) => {
551                    if f > 0 || i > u32::from(vstart) {
552                        ext_state.mark_vs_dirty();
553                        ext_state.set_vstart(i as u16);
554                    }
555                    return Err(ExecutionError::MemoryAccess(mem_err));
556                }
557            };
558            // SAFETY: Guaranteed by function contract
559            let field_base_reg =
560                unsafe { VReg::from_bits(vd.to_bits() + f * data_group_regs).unwrap_unchecked() };
561            // SAFETY: need `field_base_reg + i / (VLENB / data_eew.bytes()) < 32`.
562            //
563            // Let `data_elems_per_reg = VLENB / data_eew.bytes()`.
564            // `i < vl <= data_group_regs * data_elems_per_reg` (precondition), so
565            // `i / data_elems_per_reg < data_group_regs`.
566            //
567            // `field_base_reg = vd.to_bits() + f * data_group_regs`. Since `f < nf` and
568            // `vd.to_bits() + nf * data_group_regs <= 32` (precondition):
569            // `field_base_reg + data_group_regs <= vd.to_bits() + (f+1) * data_group_regs
570            //                                  <= vd.to_bits() + nf * data_group_regs <= 32`.
571            //
572            // Therefore,
573            // `field_base_reg + i / data_elems_per_reg < field_base_reg + data_group_regs <= 32`.
574            unsafe {
575                write_group_element(ext_state.write_vregs(), field_base_reg, i, data_eew, data);
576            }
577        }
578    }
579
580    ext_state.mark_vs_dirty();
581    ext_state.reset_vstart();
582    Ok(())
583}