Skip to main content

ab_riscv_interpreter/v/zvexx/load/
zvexx_load_helpers.rs

1//! Opaque helpers for ZveXx extension
2
3use crate::v::vector_registers::{VectorRegisterFile, VectorRegistersExt};
4use crate::v::zvexx::zvexx_helpers::INSTRUCTION_SIZE;
5use crate::{ExecutionError, ProgramCounter, VirtualMemory, VirtualMemoryError};
6use ab_riscv_primitives::prelude::*;
7use core::cmp::Ordering;
8use core::fmt;
9use core::hint::cold_path;
10
/// Test whether element `i` is active according to a packed mask.
///
/// The mask packs eight elements per byte, least-significant bit first, so element `i` lives in
/// byte `i / 8` at bit position `i % 8`. An index whose byte lies past the end of `mask` is
/// reported as inactive (`false`).
#[inline(always)]
pub(crate) fn mask_bit(mask: &[u8], i: u32) -> bool {
    let byte_index = (i / u8::BITS) as usize;
    let bit_pos = i % u8::BITS;
    match mask.get(byte_index) {
        Some(&byte) => (byte >> bit_pos) & 1 == 1,
        None => false,
    }
}
20
21/// Copy the mask bytes needed to cover `vl` elements from `v0` into a stack buffer and return
22/// it. The copy releases the shared borrow on the register file so the caller can immediately
23/// take an exclusive borrow for writes.
24///
25/// When `vm=true` (unmasked), the buffer is filled with `0xff` so that every mask bit reads as `1`.
26/// This means callers can unconditionally call [`mask_bit()`] on the returned buffer without
27/// branching on `vm`. Current callers short-circuit with `!vm &&` before calling [`mask_bit()`] as
28/// a micro-optimization on the common unmasked path, but correctness does not depend on that guard:
29/// if it were removed, the `0xff` fill ensures [`mask_bit()`] would return `true` for every
30/// element, preserving the unmasked semantics.
31///
32/// # Safety
33/// `vl.div_ceil(8)` must be `<= VLENB`. This holds when `vl <= VLEN`, which is always true
34/// when `vl` is the current architectural `vl` (bounded by `VLMAX <= VLEN`).
35#[inline(always)]
36pub(in super::super) unsafe fn snapshot_mask<const VLENB: usize>(
37    vregs: &VectorRegisterFile<VLENB>,
38    vm: bool,
39    vl: u32,
40) -> [u8; VLENB] {
41    let mut buf = [0u8; VLENB];
42    if vm {
43        // All-ones: every element active
44        buf = [0xffu8; VLENB];
45    } else {
46        let mask_bytes = vl.div_ceil(u8::BITS) as usize;
47        // SAFETY: `mask_bytes <= VLENB` by the caller's precondition
48        unsafe {
49            buf.get_unchecked_mut(..mask_bytes)
50                .copy_from_slice(vregs.get(VReg::V0).get_unchecked(..mask_bytes));
51        }
52    }
53    buf
54}
55
56/// Return whether register groups `[a, a+a_regs)` and `[b, b+b_regs)` overlap.
57#[inline(always)]
58#[doc(hidden)]
59pub fn groups_overlap(a: VReg, a_regs: u8, b: VReg, b_regs: u8) -> bool {
60    let (a, b) = (a.to_bits(), b.to_bits());
61    a < b + b_regs && b < a + a_regs
62}
63
64/// Return whether a *non-segment* indexed load's data destination group
65/// `[vd, vd + data_regs)` may legally overlap its index source group `[vs2, vs2 + index_regs)`.
66///
67/// The data EEW equals `sew` (indexed loads take their data width from `vtype.vsew()`) with
68/// `EMUL = LMUL`, whereas the index group has EEW `index_eew` and `EMUL = (index_eew / sew) *
69/// LMUL`. Because the two groups can have different EEW, the general vector register overlap
70/// constraint applies: a destination group may overlap a source group only when one of the
71/// following holds:
72///
73/// - the EEWs are equal (the groups coincide); or
74/// - the destination EEW is smaller and the overlap is in the lowest-numbered part of the source
75///   group, i.e. the destination starts at the source's base register (`vd == vs2`); or
76/// - the destination EEW is larger, the source EMUL is at least one register, and the overlap is in
77///   the highest-numbered part of the destination group, i.e. both groups end at the same register
78///   (`vd + data_regs == vs2 + index_regs`).
79///
80/// Groups that do not overlap at all are always permitted. Any other overlap is reserved.
81///
82/// Unlike indexed *segment* loads (which forbid any `vd`/`vs2` overlap to remain restartable),
83/// these relaxed rules are what allow encodings such as `vluxei32.v v16, (s2), v16` when the data
84/// and index EEW match.
85#[inline(always)]
86#[doc(hidden)]
87pub fn indexed_load_overlap_allowed(
88    vd: VReg,
89    data_regs: u8,
90    vs2: VReg,
91    index_regs: u8,
92    index_eew: Eew,
93    sew: Vsew,
94    vlmul: Vlmul,
95) -> bool {
96    if !groups_overlap(vd, data_regs, vs2, index_regs) {
97        return true;
98    }
99
100    match sew.bytes_width().cmp(&index_eew.bytes_width()) {
101        // Equal EEW: the two groups coincide, overlap is permitted.
102        Ordering::Equal => true,
103        // Smaller data EEW: overlap must be in the lowest-numbered part of the index group, which
104        // (given both groups are alignment-checked) means the data group starts at the index base.
105        Ordering::Less => vd == vs2,
106        // Larger data EEW: overlap must be in the highest-numbered part of the data group, and the
107        // index EMUL must be at least one full register. `index_regs` alone cannot distinguish a
108        // whole-register EMUL from a fractional one clamped to a single register, so the EMUL is
109        // recomputed here as `(index_eew / sew) * LMUL >= 1`.
110        Ordering::Greater => {
111            let (lmul_num, lmul_den) = vlmul.as_fraction();
112            let index_emul_at_least_one = u16::from(index_eew.bits_width()) * u16::from(lmul_num)
113                >= u16::from(sew.bits_width()) * u16::from(lmul_den);
114            let (vd, vs2) = (vd.to_bits(), vs2.to_bits());
115            index_emul_at_least_one && vd + data_regs == vs2 + index_regs
116        }
117    }
118}
119
120/// Check that `vd` is aligned to `group_regs` and that the group fits within `[0, 32)`.
121///
122/// Per spec, the base register of every register group must be a multiple of the group size.
123#[inline(always)]
124#[doc(hidden)]
125pub fn check_register_group_alignment<Reg, Memory, PC, CustomError>(
126    program_counter: &PC,
127    vd: VReg,
128    group_regs: u8,
129) -> Result<(), ExecutionError<Reg::Type, CustomError>>
130where
131    Reg: Register,
132    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
133{
134    let vd = vd.to_bits();
135    if !vd.is_multiple_of(group_regs) || vd + group_regs > 32 {
136        cold_path();
137        return Err(ExecutionError::IllegalInstruction {
138            address: program_counter.old_pc(INSTRUCTION_SIZE),
139        });
140    }
141    Ok(())
142}
143
144/// Validate segment register layout: all `nf` field groups fit within `[0, 32)`, the base
145/// register is group-aligned, and the first field group does not include `v0` when masked.
146///
147/// Field `f` occupies registers `[vd + f * group_regs, vd + f * group_regs + group_regs)`.
148/// On `Ok`, `vd.to_bits() + nf * group_regs <= 32` is guaranteed.
149#[inline(always)]
150#[doc(hidden)]
151pub fn validate_segment_registers<Reg, Memory, PC, CustomError>(
152    program_counter: &PC,
153    vd: VReg,
154    vm: bool,
155    group_regs: u8,
156    nf: Nf,
157) -> Result<(), ExecutionError<Reg::Type, CustomError>>
158where
159    Reg: Register,
160    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
161{
162    let group_regs = u32::from(group_regs);
163    let nf = u32::from(nf.fields_per_segment());
164    let vd_idx = u32::from(vd.to_bits());
165    if vd_idx % group_regs != 0 || vd_idx + nf * group_regs > 32 {
166        cold_path();
167        return Err(ExecutionError::IllegalInstruction {
168            address: program_counter.old_pc(INSTRUCTION_SIZE),
169        });
170    }
171    // When masked, no field group may contain v0 (index 0). Since groups are laid out
172    // contiguously from vd and vd is group-aligned, only the first field (f=0) could contain
173    // v0, which happens exactly when vd == 0.
174    if !vm && vd_idx == 0 {
175        cold_path();
176        return Err(ExecutionError::IllegalInstruction {
177            address: program_counter.old_pc(INSTRUCTION_SIZE),
178        });
179    }
180    Ok(())
181}
182
183/// Read element `elem_i` from register group `[base_reg, base_reg + group_regs)` into a
184/// `[u8; Eew::MAX_BYTES]` buffer.
185///
186/// The in-register position of element `elem_i` is:
187///   - register `base_reg + elem_i / (VLENB / eew.bytes())`
188///   - byte offset `(elem_i % (VLENB / eew.bytes())) * eew.bytes()`
189///
190/// The result is placed in `buf[..eew.bytes()]`; the remaining bytes are zero.
191///
192/// # Safety
193/// `base_reg + elem_i / (VLENB / eew.bytes())` must be less than 32, i.e. `elem_i` must be
194/// a valid element index within the register group.
195#[inline(always)]
196pub(in super::super) unsafe fn read_group_element<const VLENB: usize>(
197    vregs: &VectorRegisterFile<VLENB>,
198    base_reg: VReg,
199    // TODO: `elem_i` here and in other places shouldn't be `u32`
200    elem_i: u32,
201    eew: Eew,
202) -> [u8; Eew::MAX_BYTES as usize] {
203    let elem_bytes = usize::from(eew.bytes_width());
204    let elems_per_reg = VLENB / elem_bytes;
205    let reg_off = elem_i as usize / elems_per_reg;
206    let byte_off = (elem_i as usize % elems_per_reg) * elem_bytes;
207    // SAFETY: `base_reg + reg_off < 32` by the caller's precondition
208    let reg = unsafe {
209        vregs.get(VReg::from_bits(base_reg.to_bits() + reg_off as u8).unwrap_unchecked())
210    };
211    // SAFETY: `byte_off + elem_bytes <= VLENB`: the maximum `byte_off` is
212    // `(elems_per_reg - 1) * elem_bytes = VLENB - elem_bytes`, so
213    // `byte_off + elem_bytes <= VLENB - elem_bytes + elem_bytes = VLENB`.
214    // `elem_bytes <= Eew::MAX_BYTES`: all `Eew` variants are at most E64.
215    let src = unsafe { reg.get_unchecked(byte_off..byte_off + elem_bytes) };
216    let mut buf = [0; _];
217    // SAFETY: `elem_bytes <= Eew::MAX_BYTES` as established above, so `..elem_bytes` is in bounds
218    // for `buf`
219    unsafe { buf.get_unchecked_mut(..elem_bytes) }.copy_from_slice(src);
220    buf
221}
222
223/// Write `eew`-sized data from `buf[..eew.bytes()]` into element `elem_i` of register group
224/// `[base_reg, base_reg + group_regs)`.
225///
226/// The in-register position follows the same layout as [`read_group_element`].
227///
228/// # Safety
229/// `base_reg + elem_i / (VLENB / eew.bytes())` must be less than 32, i.e. `elem_i` must be
230/// a valid element index within the register group.
231#[inline(always)]
232unsafe fn write_group_element<const VLENB: usize>(
233    vregs: &mut VectorRegisterFile<VLENB>,
234    base_reg: VReg,
235    elem_i: u32,
236    eew: Eew,
237    buf: [u8; Eew::MAX_BYTES as usize],
238) {
239    let elem_bytes = usize::from(eew.bytes_width());
240    let elems_per_reg = VLENB / elem_bytes;
241    let reg_off = elem_i as usize / elems_per_reg;
242    let byte_off = (elem_i as usize % elems_per_reg) * elem_bytes;
243    // SAFETY: `base_reg + reg_off < 32` by the caller's precondition
244    let reg = unsafe {
245        vregs.get_mut(VReg::from_bits(base_reg.to_bits() + reg_off as u8).unwrap_unchecked())
246    };
247    // SAFETY: `byte_off + elem_bytes <= VLENB` and `elem_bytes <= Eew::MAX_BYTES`: same argument as
248    // in `read_group_element`
249    let dst = unsafe { reg.get_unchecked_mut(byte_off..byte_off + elem_bytes) };
250    // SAFETY: `elem_bytes <= Eew::MAX_BYTES` as established above, so `..elem_bytes` is in bounds
251    // for `buf`
252    dst.copy_from_slice(unsafe { buf.get_unchecked(..elem_bytes) });
253}
254
255/// Read `eew`-sized data from memory at `addr` into a `[u8; Eew::MAX_BYTES]` buffer
256/// (little-endian)
257#[inline(always)]
258fn read_mem_element(
259    memory: &impl VirtualMemory,
260    addr: u64,
261    eew: Eew,
262) -> Result<[u8; Eew::MAX_BYTES as usize], VirtualMemoryError> {
263    let source = match memory.read_slice(addr, u32::from(eew.bytes_width())) {
264        Ok(source) => source,
265        Err(err) => {
266            cold_path();
267            return Err(err);
268        }
269    };
270    let mut out = [0; _];
271    out[..usize::from(eew.bytes_width())].copy_from_slice(source);
272    Ok(out)
273}
274
275/// Execute a unit-stride or unit-stride segment load (including fault-only-first variants).
276///
277/// Segment stride between elements is `nf * eew.bytes()`. Field `f` for element `i` is at
278/// `base + i * nf * eew.bytes() + f * eew.bytes()`. When `nf == 1` this degenerates to a
279/// plain unit-stride load.
280///
281/// When `fault_only_first` is set: a memory error at element `i > 0` truncates `vl` to `i`
282/// and returns `Ok`. An error at element `0` always propagates.
283///
284/// # Safety
285/// - `vd.to_bits() % group_regs == 0`
286/// - `vd.to_bits() + nf * group_regs <= 32`
287/// - `vl <= group_regs * VLENB / eew.bytes()` (all `vl` elements fit within the destination
288///   register group; this holds when `vl` is the architectural `vl` and `group_regs` is the EMUL
289///   register count for the given `eew` and `vtype`)
290/// - When `vm=false`: `vd` does not overlap `v0` (i.e. `vd.to_bits() != 0`)
291#[inline(always)]
292#[expect(clippy::too_many_arguments, reason = "Internal API")]
293#[doc(hidden)]
294pub unsafe fn execute_unit_stride_load<
295    const FAULT_ONLY_FIRST: bool,
296    Reg,
297    ExtState,
298    Memory,
299    CustomError,
300>(
301    ext_state: &mut ExtState,
302    memory: &Memory,
303    vd: VReg,
304    vm: bool,
305    base: u64,
306    eew: Eew,
307    group_regs: u8,
308    nf: Nf,
309) -> Result<(), ExecutionError<Reg::Type, CustomError>>
310where
311    Reg: Register,
312    ExtState: VectorRegistersExt<Reg, CustomError>,
313    [(); ExtState::ELEN as usize]:,
314    [(); ExtState::VLEN as usize]:,
315    [(); ExtState::VLENB as usize]:,
316    Memory: VirtualMemory,
317    CustomError: fmt::Debug,
318{
319    let vl = ext_state.vl();
320    let vstart = ext_state.vstart();
321    let elem_bytes = eew.bytes_width();
322    let segment_stride = u64::from(nf.fields_per_segment()) * u64::from(elem_bytes);
323
324    // SAFETY: `vl <= VLMAX <= VLEN`, so `vl.div_ceil(8) <= VLENB`.
325    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
326
327    for i in u32::from(vstart)..vl {
328        if !vm && !mask_bit(&mask_buf, i) {
329            continue;
330        }
331
332        let elem_base = base.wrapping_add(u64::from(i) * segment_stride);
333
334        // Read all nf fields into a stack buffer before writing any of them.
335        // This ensures a fault on field f>0 leaves the destination registers untouched for the
336        // faulting element, so only elements with index new_vl are ever written (fault-only-first
337        // semantics).
338        //
339        // Sized by `Nf::MAX * Eew::MAX_BYTES`: the V spec allows at most 8 fields (nf in 1..=8)
340        // each is at most 8 bytes (E64), giving 64 bytes.
341        let mut field_buf =
342            [[0u8; usize::from(Eew::MAX_BYTES)]; usize::from(Nf::MAX.fields_per_segment())];
343
344        for f in 0..nf.fields_per_segment() {
345            let addr = elem_base.wrapping_add(u64::from(f * elem_bytes));
346            match read_mem_element(memory, addr, eew) {
347                Ok(data) => {
348                    // SAFETY: `f < nf` and the precondition on this function requires
349                    // `nf <= Nf::MAX` (the V spec encodes nf in 3 bits giving 1..=Nf::MAX, and the
350                    // decoder enforces this before constructing the instruction). Therefore, `f as
351                    // usize < nf as usize <= Nf::MAX`, which is exactly the length of `field_buf`.
352                    unsafe {
353                        *field_buf.get_unchecked_mut(f as usize) = data;
354                    }
355                }
356                Err(mem_err) => {
357                    cold_path();
358                    if FAULT_ONLY_FIRST && i > 0 {
359                        ext_state.set_vl(i);
360                        ext_state.mark_vs_dirty();
361                        ext_state.reset_vstart();
362                        return Ok(());
363                    }
364                    if i > u32::from(vstart) {
365                        // Elements [vstart, i) were committed; VS is now dirty.
366                        ext_state.mark_vs_dirty();
367                        // vstart records the faulting element for restartability.
368                        ext_state.set_vstart(i as u16);
369                    }
370                    return Err(ExecutionError::MemoryAccess(mem_err));
371                }
372            }
373        }
374
375        // All nf fields for element i were read successfully; commit to the register file.
376        for f in 0..nf.fields_per_segment() {
377            // SAFETY: Guaranteed by function contract
378            let field_base_reg =
379                unsafe { VReg::from_bits(vd.to_bits() + f * group_regs).unwrap_unchecked() };
380            // SAFETY: need `field_base_reg + i / (VLENB / elem_bytes) < 32`.
381            //
382            // Let `elems_per_reg = VLENB / elem_bytes`.
383            // `i < vl <= group_regs * elems_per_reg` (precondition), so
384            // `i / elems_per_reg < group_regs`.
385            //
386            // `field_base_reg = vd.to_bits() + f * group_regs`. Since `f < nf` and the
387            // precondition guarantees `vd.to_bits() + nf * group_regs <= 32`:
388            // `field_base_reg + group_regs <= vd.to_bits() + (f+1) * group_regs
389            //                             <= vd.to_bits() + nf * group_regs <= 32`.
390            //
391            // Therefore, `field_base_reg + i / elems_per_reg
392            //            < field_base_reg + group_regs <= 32`.
393            //
394            // For `field_buf`: `f < nf <= Nf::MAX` (the same argument as in the read loop
395            // above), so `f as usize < Nf::MAX = field_buf.len()`.
396            unsafe {
397                write_group_element(
398                    ext_state.write_vregs(),
399                    field_base_reg,
400                    i,
401                    eew,
402                    *field_buf.get_unchecked(f as usize),
403                );
404            }
405        }
406    }
407
408    ext_state.mark_vs_dirty();
409    ext_state.reset_vstart();
410    Ok(())
411}
412
413/// Execute a strided or strided segment load.
414///
415/// `addr[i] = base + i * stride` where `stride` is a signed XLEN-wide value. Field `f` of
416/// element `i` is at `addr[i] + f * eew.bytes()`.
417///
418/// # Safety
419/// - `vd.to_bits() % group_regs == 0`
420/// - `vd.to_bits() + nf * group_regs <= 32`
421/// - `vl <= group_regs * VLENB / eew.bytes()`
422/// - When `vm=false`: `vd` does not overlap `v0` (i.e. `vd.to_bits() != 0`)
423#[inline(always)]
424#[expect(clippy::too_many_arguments, reason = "Internal API")]
425#[doc(hidden)]
426pub unsafe fn execute_strided_load<Reg, ExtState, Memory, CustomError>(
427    ext_state: &mut ExtState,
428    memory: &Memory,
429    vd: VReg,
430    vm: bool,
431    base: u64,
432    stride: i64,
433    eew: Eew,
434    group_regs: u8,
435    nf: Nf,
436) -> Result<(), ExecutionError<Reg::Type, CustomError>>
437where
438    Reg: Register,
439    ExtState: VectorRegistersExt<Reg, CustomError>,
440    [(); ExtState::ELEN as usize]:,
441    [(); ExtState::VLEN as usize]:,
442    [(); ExtState::VLENB as usize]:,
443    Memory: VirtualMemory,
444    CustomError: fmt::Debug,
445{
446    let vl = ext_state.vl();
447    let vstart = ext_state.vstart();
448    let elem_bytes = eew.bytes_width();
449
450    // SAFETY: `vl <= VLMAX <= VLEN` (precondition), so `vl.div_ceil(8) <= VLEN / 8 = VLENB`.
451    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
452
453    for i in u32::from(vstart)..vl {
454        if !vm && !mask_bit(&mask_buf, i) {
455            continue;
456        }
457
458        let elem_base = base.wrapping_add(i64::from(i).wrapping_mul(stride).cast_unsigned());
459
460        for f in 0..nf.fields_per_segment() {
461            let addr = elem_base.wrapping_add(u64::from(f * elem_bytes));
462            let data = match read_mem_element(memory, addr, eew) {
463                Ok(data) => data,
464                Err(mem_err) => {
465                    cold_path();
466                    if f > 0 || i > u32::from(vstart) {
467                        ext_state.mark_vs_dirty();
468                        ext_state.set_vstart(i as u16);
469                    }
470                    return Err(ExecutionError::MemoryAccess(mem_err));
471                }
472            };
473            // SAFETY: Guaranteed by function contract
474            let field_base_reg =
475                unsafe { VReg::from_bits(vd.to_bits() + f * group_regs).unwrap_unchecked() };
476            // SAFETY: need `field_base_reg + i / (VLENB / elem_bytes) < 32`.
477            //
478            // Let `elems_per_reg = VLENB / elem_bytes`.
479            // `i < vl <= group_regs * elems_per_reg` (precondition), so
480            // `i / elems_per_reg < group_regs`.
481            //
482            // `field_base_reg = vd.to_bits() + f * group_regs`. Since `f < nf` and
483            // `vd.to_bits() + nf * group_regs <= 32` (precondition):
484            // `field_base_reg + group_regs <= vd.to_bits() + (f+1) * group_regs
485            //                             <= vd.to_bits() + nf * group_regs <= 32`.
486            //
487            // Therefore, `field_base_reg + i / elems_per_reg < field_base_reg + group_regs <= 32`.
488            unsafe {
489                write_group_element(ext_state.write_vregs(), field_base_reg, i, eew, data);
490            }
491        }
492    }
493
494    ext_state.mark_vs_dirty();
495    ext_state.reset_vstart();
496    Ok(())
497}
498
499/// Execute an indexed (unordered or ordered) or indexed segment load.
500///
501/// For element `i`, reads `index_eew`-sized bytes from register group `vs2` at element `i`
502/// to obtain a zero-extended byte offset, then loads `nf` data fields from
503/// `base + offset + f * data_eew.bytes()`. Unordered vs ordered is functionally identical in
504/// a software interpreter.
505///
506/// # Safety
507/// - `vd.to_bits() % data_group_regs == 0`
508/// - `vd.to_bits() + nf * data_group_regs <= 32`
509/// - `vs2.to_bits() + (vl - 1) / (VLENB / index_eew.bytes()) < 32` (all `vl` index elements fit
510///   within the register file; satisfied when `vs2` is alignment-checked against `EMUL_index` and
511///   `vl` is the architectural `vl` bounded by `VLMAX`)
512/// - `vl <= data_group_regs * VLENB / data_eew.bytes()` (all `vl` elements fit in a data group)
513/// - When `vm=false`: `vd` does not overlap `v0` (i.e. `vd.to_bits() != 0`)
514#[inline(always)]
515#[expect(clippy::too_many_arguments, reason = "Internal API")]
516#[doc(hidden)]
517pub unsafe fn execute_indexed_load<Reg, ExtState, Memory, CustomError>(
518    ext_state: &mut ExtState,
519    memory: &Memory,
520    vd: VReg,
521    vs2: VReg,
522    vm: bool,
523    base: u64,
524    data_eew: Eew,
525    index_eew: Eew,
526    data_group_regs: u8,
527    nf: Nf,
528) -> Result<(), ExecutionError<Reg::Type, CustomError>>
529where
530    Reg: Register,
531    ExtState: VectorRegistersExt<Reg, CustomError>,
532    [(); ExtState::ELEN as usize]:,
533    [(); ExtState::VLEN as usize]:,
534    [(); ExtState::VLENB as usize]:,
535    Memory: VirtualMemory,
536    CustomError: fmt::Debug,
537{
538    let vl = ext_state.vl();
539    let vstart = ext_state.vstart();
540    let index_base_reg = vs2;
541
542    // SAFETY: `vl <= VLMAX <= VLEN` (precondition), so `vl.div_ceil(8) <= VLEN / 8 = VLENB`.
543    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
544
545    for i in u32::from(vstart)..vl {
546        if !vm && !mask_bit(&mask_buf, i) {
547            continue;
548        }
549
550        // SAFETY: need `index_base_reg + i / (VLENB / index_eew.bytes()) < 32`.
551        //
552        // The caller verified `vs2` is aligned to `EMUL_index` registers and that
553        // `vs2.to_bits() + EMUL_index <= 32`. `EMUL_index` is defined so that
554        // `EMUL_index * (VLENB / index_eew.bytes()) = VLMAX`. Since `i < vl <= VLMAX`,
555        // `i / (VLENB / index_eew.bytes()) < EMUL_index`, and therefore
556        // `index_base_reg + i / (VLENB / index_eew.bytes()) < index_base_reg + EMUL_index <= 32`.
557        let index_buf =
558            unsafe { read_group_element(ext_state.read_vregs(), index_base_reg, i, index_eew) };
559        let offset = u64::from_le_bytes(index_buf);
560        let elem_addr = base.wrapping_add(offset);
561
562        let data_elem_bytes = data_eew.bytes_width();
563        for f in 0..nf.fields_per_segment() {
564            let addr = elem_addr.wrapping_add(u64::from(f) * u64::from(data_elem_bytes));
565            let data = match read_mem_element(memory, addr, data_eew) {
566                Ok(data) => data,
567                Err(mem_err) => {
568                    cold_path();
569                    if f > 0 || i > u32::from(vstart) {
570                        ext_state.mark_vs_dirty();
571                        ext_state.set_vstart(i as u16);
572                    }
573                    return Err(ExecutionError::MemoryAccess(mem_err));
574                }
575            };
576            // SAFETY: Guaranteed by function contract
577            let field_base_reg =
578                unsafe { VReg::from_bits(vd.to_bits() + f * data_group_regs).unwrap_unchecked() };
579            // SAFETY: need `field_base_reg + i / (VLENB / data_eew.bytes()) < 32`.
580            //
581            // Let `data_elems_per_reg = VLENB / data_eew.bytes()`.
582            // `i < vl <= data_group_regs * data_elems_per_reg` (precondition), so
583            // `i / data_elems_per_reg < data_group_regs`.
584            //
585            // `field_base_reg = vd.to_bits() + f * data_group_regs`. Since `f < nf` and
586            // `vd.to_bits() + nf * data_group_regs <= 32` (precondition):
587            // `field_base_reg + data_group_regs <= vd.to_bits() + (f+1) * data_group_regs
588            //                                  <= vd.to_bits() + nf * data_group_regs <= 32`.
589            //
590            // Therefore,
591            // `field_base_reg + i / data_elems_per_reg < field_base_reg + data_group_regs <= 32`.
592            unsafe {
593                write_group_element(ext_state.write_vregs(), field_base_reg, i, data_eew, data);
594            }
595        }
596    }
597
598    ext_state.mark_vs_dirty();
599    ext_state.reset_vstart();
600    Ok(())
601}