ab_riscv_interpreter/v/zvexx/load/zvexx_load_helpers.rs
1//! Opaque helpers for ZveXx extension
2
3use crate::v::vector_registers::{VectorRegisterFile, VectorRegistersExt};
4use crate::v::zvexx::zvexx_helpers::INSTRUCTION_SIZE;
5use crate::{ExecutionError, ProgramCounter, VirtualMemory, VirtualMemoryError};
6use ab_riscv_primitives::prelude::*;
7use core::cmp::Ordering;
8use core::fmt;
9use core::hint::cold_path;
10
/// Test whether element `i` is active according to the packed mask bytes in `mask`.
///
/// The mask is packed LSB-first: element `i` lives in byte `i / 8` at bit position `i % 8`.
/// An index that falls beyond the end of `mask` is reported as inactive (`false`).
#[inline(always)]
pub(crate) fn mask_bit(mask: &[u8], i: u32) -> bool {
    let byte_index = (i / u8::BITS) as usize;
    let bit_index = i % u8::BITS;
    match mask.get(byte_index) {
        Some(&byte) => (byte >> bit_index) & 1 == 1,
        // Out-of-range elements are never active
        None => false,
    }
}
20
21/// Copy the mask bytes needed to cover `vl` elements from `v0` into a stack buffer and return
22/// it. The copy releases the shared borrow on the register file so the caller can immediately
23/// take an exclusive borrow for writes.
24///
25/// When `vm=true` (unmasked), the buffer is filled with `0xff` so that every mask bit reads as `1`.
26/// This means callers can unconditionally call [`mask_bit()`] on the returned buffer without
27/// branching on `vm`. Current callers short-circuit with `!vm &&` before calling [`mask_bit()`] as
28/// a micro-optimization on the common unmasked path, but correctness does not depend on that guard:
29/// if it were removed, the `0xff` fill ensures [`mask_bit()`] would return `true` for every
30/// element, preserving the unmasked semantics.
31///
32/// # Safety
33/// `vl.div_ceil(8)` must be `<= VLENB`. This holds when `vl <= VLEN`, which is always true
34/// when `vl` is the current architectural `vl` (bounded by `VLMAX <= VLEN`).
35#[inline(always)]
36pub(in super::super) unsafe fn snapshot_mask<const VLENB: usize>(
37 vregs: &VectorRegisterFile<VLENB>,
38 vm: bool,
39 vl: u32,
40) -> [u8; VLENB] {
41 let mut buf = [0u8; VLENB];
42 if vm {
43 // All-ones: every element active
44 buf = [0xffu8; VLENB];
45 } else {
46 let mask_bytes = vl.div_ceil(u8::BITS) as usize;
47 // SAFETY: `mask_bytes <= VLENB` by the caller's precondition
48 unsafe {
49 buf.get_unchecked_mut(..mask_bytes)
50 .copy_from_slice(vregs.get(VReg::V0).get_unchecked(..mask_bytes));
51 }
52 }
53 buf
54}
55
56/// Return whether register groups `[a, a+a_regs)` and `[b, b+b_regs)` overlap.
57#[inline(always)]
58#[doc(hidden)]
59pub fn groups_overlap(a: VReg, a_regs: u8, b: VReg, b_regs: u8) -> bool {
60 let (a, b) = (a.to_bits(), b.to_bits());
61 a < b + b_regs && b < a + a_regs
62}
63
64/// Return whether a *non-segment* indexed load's data destination group
65/// `[vd, vd + data_regs)` may legally overlap its index source group `[vs2, vs2 + index_regs)`.
66///
67/// The data EEW equals `sew` (indexed loads take their data width from `vtype.vsew()`) with
68/// `EMUL = LMUL`, whereas the index group has EEW `index_eew` and `EMUL = (index_eew / sew) *
69/// LMUL`. Because the two groups can have different EEW, the general vector register overlap
70/// constraint applies: a destination group may overlap a source group only when one of the
71/// following holds:
72///
73/// - the EEWs are equal (the groups coincide); or
74/// - the destination EEW is smaller and the overlap is in the lowest-numbered part of the source
75/// group, i.e. the destination starts at the source's base register (`vd == vs2`); or
76/// - the destination EEW is larger, the source EMUL is at least one register, and the overlap is in
77/// the highest-numbered part of the destination group, i.e. both groups end at the same register
78/// (`vd + data_regs == vs2 + index_regs`).
79///
80/// Groups that do not overlap at all are always permitted. Any other overlap is reserved.
81///
82/// Unlike indexed *segment* loads (which forbid any `vd`/`vs2` overlap to remain restartable),
83/// these relaxed rules are what allow encodings such as `vluxei32.v v16, (s2), v16` when the data
84/// and index EEW match.
85#[inline(always)]
86#[doc(hidden)]
87pub fn indexed_load_overlap_allowed(
88 vd: VReg,
89 data_regs: u8,
90 vs2: VReg,
91 index_regs: u8,
92 index_eew: Eew,
93 sew: Vsew,
94 vlmul: Vlmul,
95) -> bool {
96 if !groups_overlap(vd, data_regs, vs2, index_regs) {
97 return true;
98 }
99
100 match sew.bytes_width().cmp(&index_eew.bytes_width()) {
101 // Equal EEW: the two groups coincide, overlap is permitted.
102 Ordering::Equal => true,
103 // Smaller data EEW: overlap must be in the lowest-numbered part of the index group, which
104 // (given both groups are alignment-checked) means the data group starts at the index base.
105 Ordering::Less => vd == vs2,
106 // Larger data EEW: overlap must be in the highest-numbered part of the data group, and the
107 // index EMUL must be at least one full register. `index_regs` alone cannot distinguish a
108 // whole-register EMUL from a fractional one clamped to a single register, so the EMUL is
109 // recomputed here as `(index_eew / sew) * LMUL >= 1`.
110 Ordering::Greater => {
111 let (lmul_num, lmul_den) = vlmul.as_fraction();
112 let index_emul_at_least_one = u16::from(index_eew.bits_width()) * u16::from(lmul_num)
113 >= u16::from(sew.bits_width()) * u16::from(lmul_den);
114 let (vd, vs2) = (vd.to_bits(), vs2.to_bits());
115 index_emul_at_least_one && vd + data_regs == vs2 + index_regs
116 }
117 }
118}
119
120/// Check that `vd` is aligned to `group_regs` and that the group fits within `[0, 32)`.
121///
122/// Per spec, the base register of every register group must be a multiple of the group size.
123#[inline(always)]
124#[doc(hidden)]
125pub fn check_register_group_alignment<Reg, Memory, PC, CustomError>(
126 program_counter: &PC,
127 vd: VReg,
128 group_regs: u8,
129) -> Result<(), ExecutionError<Reg::Type, CustomError>>
130where
131 Reg: Register,
132 PC: ProgramCounter<Reg::Type, Memory, CustomError>,
133{
134 let vd = vd.to_bits();
135 if !vd.is_multiple_of(group_regs) || vd + group_regs > 32 {
136 cold_path();
137 return Err(ExecutionError::IllegalInstruction {
138 address: program_counter.old_pc(INSTRUCTION_SIZE),
139 });
140 }
141 Ok(())
142}
143
144/// Validate segment register layout: all `nf` field groups fit within `[0, 32)`, the base
145/// register is group-aligned, and the first field group does not include `v0` when masked.
146///
147/// Field `f` occupies registers `[vd + f * group_regs, vd + f * group_regs + group_regs)`.
148/// On `Ok`, `vd.to_bits() + nf * group_regs <= 32` is guaranteed.
149#[inline(always)]
150#[doc(hidden)]
151pub fn validate_segment_registers<Reg, Memory, PC, CustomError>(
152 program_counter: &PC,
153 vd: VReg,
154 vm: bool,
155 group_regs: u8,
156 nf: Nf,
157) -> Result<(), ExecutionError<Reg::Type, CustomError>>
158where
159 Reg: Register,
160 PC: ProgramCounter<Reg::Type, Memory, CustomError>,
161{
162 let group_regs = u32::from(group_regs);
163 let nf = u32::from(nf.fields_per_segment());
164 let vd_idx = u32::from(vd.to_bits());
165 if vd_idx % group_regs != 0 || vd_idx + nf * group_regs > 32 {
166 cold_path();
167 return Err(ExecutionError::IllegalInstruction {
168 address: program_counter.old_pc(INSTRUCTION_SIZE),
169 });
170 }
171 // When masked, no field group may contain v0 (index 0). Since groups are laid out
172 // contiguously from vd and vd is group-aligned, only the first field (f=0) could contain
173 // v0, which happens exactly when vd == 0.
174 if !vm && vd_idx == 0 {
175 cold_path();
176 return Err(ExecutionError::IllegalInstruction {
177 address: program_counter.old_pc(INSTRUCTION_SIZE),
178 });
179 }
180 Ok(())
181}
182
183/// Read element `elem_i` from register group `[base_reg, base_reg + group_regs)` into a
184/// `[u8; Eew::MAX_BYTES]` buffer.
185///
186/// The in-register position of element `elem_i` is:
187/// - register `base_reg + elem_i / (VLENB / eew.bytes())`
188/// - byte offset `(elem_i % (VLENB / eew.bytes())) * eew.bytes()`
189///
190/// The result is placed in `buf[..eew.bytes()]`; the remaining bytes are zero.
191///
192/// # Safety
193/// `base_reg + elem_i / (VLENB / eew.bytes())` must be less than 32, i.e. `elem_i` must be
194/// a valid element index within the register group.
195#[inline(always)]
196pub(in super::super) unsafe fn read_group_element<const VLENB: usize>(
197 vregs: &VectorRegisterFile<VLENB>,
198 base_reg: VReg,
199 // TODO: `elem_i` here and in other places shouldn't be `u32`
200 elem_i: u32,
201 eew: Eew,
202) -> [u8; Eew::MAX_BYTES as usize] {
203 let elem_bytes = usize::from(eew.bytes_width());
204 let elems_per_reg = VLENB / elem_bytes;
205 let reg_off = elem_i as usize / elems_per_reg;
206 let byte_off = (elem_i as usize % elems_per_reg) * elem_bytes;
207 // SAFETY: `base_reg + reg_off < 32` by the caller's precondition
208 let reg = unsafe {
209 vregs.get(VReg::from_bits(base_reg.to_bits() + reg_off as u8).unwrap_unchecked())
210 };
211 // SAFETY: `byte_off + elem_bytes <= VLENB`: the maximum `byte_off` is
212 // `(elems_per_reg - 1) * elem_bytes = VLENB - elem_bytes`, so
213 // `byte_off + elem_bytes <= VLENB - elem_bytes + elem_bytes = VLENB`.
214 // `elem_bytes <= Eew::MAX_BYTES`: all `Eew` variants are at most E64.
215 let src = unsafe { reg.get_unchecked(byte_off..byte_off + elem_bytes) };
216 let mut buf = [0; _];
217 // SAFETY: `elem_bytes <= Eew::MAX_BYTES` as established above, so `..elem_bytes` is in bounds
218 // for `buf`
219 unsafe { buf.get_unchecked_mut(..elem_bytes) }.copy_from_slice(src);
220 buf
221}
222
223/// Write `eew`-sized data from `buf[..eew.bytes()]` into element `elem_i` of register group
224/// `[base_reg, base_reg + group_regs)`.
225///
226/// The in-register position follows the same layout as [`read_group_element`].
227///
228/// # Safety
229/// `base_reg + elem_i / (VLENB / eew.bytes())` must be less than 32, i.e. `elem_i` must be
230/// a valid element index within the register group.
231#[inline(always)]
232unsafe fn write_group_element<const VLENB: usize>(
233 vregs: &mut VectorRegisterFile<VLENB>,
234 base_reg: VReg,
235 elem_i: u32,
236 eew: Eew,
237 buf: [u8; Eew::MAX_BYTES as usize],
238) {
239 let elem_bytes = usize::from(eew.bytes_width());
240 let elems_per_reg = VLENB / elem_bytes;
241 let reg_off = elem_i as usize / elems_per_reg;
242 let byte_off = (elem_i as usize % elems_per_reg) * elem_bytes;
243 // SAFETY: `base_reg + reg_off < 32` by the caller's precondition
244 let reg = unsafe {
245 vregs.get_mut(VReg::from_bits(base_reg.to_bits() + reg_off as u8).unwrap_unchecked())
246 };
247 // SAFETY: `byte_off + elem_bytes <= VLENB` and `elem_bytes <= Eew::MAX_BYTES`: same argument as
248 // in `read_group_element`
249 let dst = unsafe { reg.get_unchecked_mut(byte_off..byte_off + elem_bytes) };
250 // SAFETY: `elem_bytes <= Eew::MAX_BYTES` as established above, so `..elem_bytes` is in bounds
251 // for `buf`
252 dst.copy_from_slice(unsafe { buf.get_unchecked(..elem_bytes) });
253}
254
255/// Read `eew`-sized data from memory at `addr` into a `[u8; Eew::MAX_BYTES]` buffer
256/// (little-endian)
257#[inline(always)]
258fn read_mem_element(
259 memory: &impl VirtualMemory,
260 addr: u64,
261 eew: Eew,
262) -> Result<[u8; Eew::MAX_BYTES as usize], VirtualMemoryError> {
263 let source = match memory.read_slice(addr, u32::from(eew.bytes_width())) {
264 Ok(source) => source,
265 Err(err) => {
266 cold_path();
267 return Err(err);
268 }
269 };
270 let mut out = [0; _];
271 out[..usize::from(eew.bytes_width())].copy_from_slice(source);
272 Ok(out)
273}
274
275/// Execute a unit-stride or unit-stride segment load (including fault-only-first variants).
276///
277/// Segment stride between elements is `nf * eew.bytes()`. Field `f` for element `i` is at
278/// `base + i * nf * eew.bytes() + f * eew.bytes()`. When `nf == 1` this degenerates to a
279/// plain unit-stride load.
280///
281/// When `fault_only_first` is set: a memory error at element `i > 0` truncates `vl` to `i`
282/// and returns `Ok`. An error at element `0` always propagates.
283///
284/// # Safety
285/// - `vd.to_bits() % group_regs == 0`
286/// - `vd.to_bits() + nf * group_regs <= 32`
287/// - `vl <= group_regs * VLENB / eew.bytes()` (all `vl` elements fit within the destination
288/// register group; this holds when `vl` is the architectural `vl` and `group_regs` is the EMUL
289/// register count for the given `eew` and `vtype`)
290/// - When `vm=false`: `vd` does not overlap `v0` (i.e. `vd.to_bits() != 0`)
291#[inline(always)]
292#[expect(clippy::too_many_arguments, reason = "Internal API")]
293#[doc(hidden)]
294pub unsafe fn execute_unit_stride_load<
295 const FAULT_ONLY_FIRST: bool,
296 Reg,
297 ExtState,
298 Memory,
299 CustomError,
300>(
301 ext_state: &mut ExtState,
302 memory: &Memory,
303 vd: VReg,
304 vm: bool,
305 base: u64,
306 eew: Eew,
307 group_regs: u8,
308 nf: Nf,
309) -> Result<(), ExecutionError<Reg::Type, CustomError>>
310where
311 Reg: Register,
312 ExtState: VectorRegistersExt<Reg, CustomError>,
313 [(); ExtState::ELEN as usize]:,
314 [(); ExtState::VLEN as usize]:,
315 [(); ExtState::VLENB as usize]:,
316 Memory: VirtualMemory,
317 CustomError: fmt::Debug,
318{
319 let vl = ext_state.vl();
320 let vstart = ext_state.vstart();
321 let elem_bytes = eew.bytes_width();
322 let segment_stride = u64::from(nf.fields_per_segment()) * u64::from(elem_bytes);
323
324 // SAFETY: `vl <= VLMAX <= VLEN`, so `vl.div_ceil(8) <= VLENB`.
325 let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
326
327 for i in u32::from(vstart)..vl {
328 if !vm && !mask_bit(&mask_buf, i) {
329 continue;
330 }
331
332 let elem_base = base.wrapping_add(u64::from(i) * segment_stride);
333
334 // Read all nf fields into a stack buffer before writing any of them.
335 // This ensures a fault on field f>0 leaves the destination registers untouched for the
336 // faulting element, so only elements with index new_vl are ever written (fault-only-first
337 // semantics).
338 //
339 // Sized by `Nf::MAX * Eew::MAX_BYTES`: the V spec allows at most 8 fields (nf in 1..=8)
340 // each is at most 8 bytes (E64), giving 64 bytes.
341 let mut field_buf =
342 [[0u8; usize::from(Eew::MAX_BYTES)]; usize::from(Nf::MAX.fields_per_segment())];
343
344 for f in 0..nf.fields_per_segment() {
345 let addr = elem_base.wrapping_add(u64::from(f * elem_bytes));
346 match read_mem_element(memory, addr, eew) {
347 Ok(data) => {
348 // SAFETY: `f < nf` and the precondition on this function requires
349 // `nf <= Nf::MAX` (the V spec encodes nf in 3 bits giving 1..=Nf::MAX, and the
350 // decoder enforces this before constructing the instruction). Therefore, `f as
351 // usize < nf as usize <= Nf::MAX`, which is exactly the length of `field_buf`.
352 unsafe {
353 *field_buf.get_unchecked_mut(f as usize) = data;
354 }
355 }
356 Err(mem_err) => {
357 cold_path();
358 if FAULT_ONLY_FIRST && i > 0 {
359 ext_state.set_vl(i);
360 ext_state.mark_vs_dirty();
361 ext_state.reset_vstart();
362 return Ok(());
363 }
364 if i > u32::from(vstart) {
365 // Elements [vstart, i) were committed; VS is now dirty.
366 ext_state.mark_vs_dirty();
367 // vstart records the faulting element for restartability.
368 ext_state.set_vstart(i as u16);
369 }
370 return Err(ExecutionError::MemoryAccess(mem_err));
371 }
372 }
373 }
374
375 // All nf fields for element i were read successfully; commit to the register file.
376 for f in 0..nf.fields_per_segment() {
377 // SAFETY: Guaranteed by function contract
378 let field_base_reg =
379 unsafe { VReg::from_bits(vd.to_bits() + f * group_regs).unwrap_unchecked() };
380 // SAFETY: need `field_base_reg + i / (VLENB / elem_bytes) < 32`.
381 //
382 // Let `elems_per_reg = VLENB / elem_bytes`.
383 // `i < vl <= group_regs * elems_per_reg` (precondition), so
384 // `i / elems_per_reg < group_regs`.
385 //
386 // `field_base_reg = vd.to_bits() + f * group_regs`. Since `f < nf` and the
387 // precondition guarantees `vd.to_bits() + nf * group_regs <= 32`:
388 // `field_base_reg + group_regs <= vd.to_bits() + (f+1) * group_regs
389 // <= vd.to_bits() + nf * group_regs <= 32`.
390 //
391 // Therefore, `field_base_reg + i / elems_per_reg
392 // < field_base_reg + group_regs <= 32`.
393 //
394 // For `field_buf`: `f < nf <= Nf::MAX` (the same argument as in the read loop
395 // above), so `f as usize < Nf::MAX = field_buf.len()`.
396 unsafe {
397 write_group_element(
398 ext_state.write_vregs(),
399 field_base_reg,
400 i,
401 eew,
402 *field_buf.get_unchecked(f as usize),
403 );
404 }
405 }
406 }
407
408 ext_state.mark_vs_dirty();
409 ext_state.reset_vstart();
410 Ok(())
411}
412
413/// Execute a strided or strided segment load.
414///
415/// `addr[i] = base + i * stride` where `stride` is a signed XLEN-wide value. Field `f` of
416/// element `i` is at `addr[i] + f * eew.bytes()`.
417///
418/// # Safety
419/// - `vd.to_bits() % group_regs == 0`
420/// - `vd.to_bits() + nf * group_regs <= 32`
421/// - `vl <= group_regs * VLENB / eew.bytes()`
422/// - When `vm=false`: `vd` does not overlap `v0` (i.e. `vd.to_bits() != 0`)
423#[inline(always)]
424#[expect(clippy::too_many_arguments, reason = "Internal API")]
425#[doc(hidden)]
426pub unsafe fn execute_strided_load<Reg, ExtState, Memory, CustomError>(
427 ext_state: &mut ExtState,
428 memory: &Memory,
429 vd: VReg,
430 vm: bool,
431 base: u64,
432 stride: i64,
433 eew: Eew,
434 group_regs: u8,
435 nf: Nf,
436) -> Result<(), ExecutionError<Reg::Type, CustomError>>
437where
438 Reg: Register,
439 ExtState: VectorRegistersExt<Reg, CustomError>,
440 [(); ExtState::ELEN as usize]:,
441 [(); ExtState::VLEN as usize]:,
442 [(); ExtState::VLENB as usize]:,
443 Memory: VirtualMemory,
444 CustomError: fmt::Debug,
445{
446 let vl = ext_state.vl();
447 let vstart = ext_state.vstart();
448 let elem_bytes = eew.bytes_width();
449
450 // SAFETY: `vl <= VLMAX <= VLEN` (precondition), so `vl.div_ceil(8) <= VLEN / 8 = VLENB`.
451 let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
452
453 for i in u32::from(vstart)..vl {
454 if !vm && !mask_bit(&mask_buf, i) {
455 continue;
456 }
457
458 let elem_base = base.wrapping_add(i64::from(i).wrapping_mul(stride).cast_unsigned());
459
460 for f in 0..nf.fields_per_segment() {
461 let addr = elem_base.wrapping_add(u64::from(f * elem_bytes));
462 let data = match read_mem_element(memory, addr, eew) {
463 Ok(data) => data,
464 Err(mem_err) => {
465 cold_path();
466 if f > 0 || i > u32::from(vstart) {
467 ext_state.mark_vs_dirty();
468 ext_state.set_vstart(i as u16);
469 }
470 return Err(ExecutionError::MemoryAccess(mem_err));
471 }
472 };
473 // SAFETY: Guaranteed by function contract
474 let field_base_reg =
475 unsafe { VReg::from_bits(vd.to_bits() + f * group_regs).unwrap_unchecked() };
476 // SAFETY: need `field_base_reg + i / (VLENB / elem_bytes) < 32`.
477 //
478 // Let `elems_per_reg = VLENB / elem_bytes`.
479 // `i < vl <= group_regs * elems_per_reg` (precondition), so
480 // `i / elems_per_reg < group_regs`.
481 //
482 // `field_base_reg = vd.to_bits() + f * group_regs`. Since `f < nf` and
483 // `vd.to_bits() + nf * group_regs <= 32` (precondition):
484 // `field_base_reg + group_regs <= vd.to_bits() + (f+1) * group_regs
485 // <= vd.to_bits() + nf * group_regs <= 32`.
486 //
487 // Therefore, `field_base_reg + i / elems_per_reg < field_base_reg + group_regs <= 32`.
488 unsafe {
489 write_group_element(ext_state.write_vregs(), field_base_reg, i, eew, data);
490 }
491 }
492 }
493
494 ext_state.mark_vs_dirty();
495 ext_state.reset_vstart();
496 Ok(())
497}
498
499/// Execute an indexed (unordered or ordered) or indexed segment load.
500///
501/// For element `i`, reads `index_eew`-sized bytes from register group `vs2` at element `i`
502/// to obtain a zero-extended byte offset, then loads `nf` data fields from
503/// `base + offset + f * data_eew.bytes()`. Unordered vs ordered is functionally identical in
504/// a software interpreter.
505///
506/// # Safety
507/// - `vd.to_bits() % data_group_regs == 0`
508/// - `vd.to_bits() + nf * data_group_regs <= 32`
509/// - `vs2.to_bits() + (vl - 1) / (VLENB / index_eew.bytes()) < 32` (all `vl` index elements fit
510/// within the register file; satisfied when `vs2` is alignment-checked against `EMUL_index` and
511/// `vl` is the architectural `vl` bounded by `VLMAX`)
512/// - `vl <= data_group_regs * VLENB / data_eew.bytes()` (all `vl` elements fit in a data group)
513/// - When `vm=false`: `vd` does not overlap `v0` (i.e. `vd.to_bits() != 0`)
514#[inline(always)]
515#[expect(clippy::too_many_arguments, reason = "Internal API")]
516#[doc(hidden)]
517pub unsafe fn execute_indexed_load<Reg, ExtState, Memory, CustomError>(
518 ext_state: &mut ExtState,
519 memory: &Memory,
520 vd: VReg,
521 vs2: VReg,
522 vm: bool,
523 base: u64,
524 data_eew: Eew,
525 index_eew: Eew,
526 data_group_regs: u8,
527 nf: Nf,
528) -> Result<(), ExecutionError<Reg::Type, CustomError>>
529where
530 Reg: Register,
531 ExtState: VectorRegistersExt<Reg, CustomError>,
532 [(); ExtState::ELEN as usize]:,
533 [(); ExtState::VLEN as usize]:,
534 [(); ExtState::VLENB as usize]:,
535 Memory: VirtualMemory,
536 CustomError: fmt::Debug,
537{
538 let vl = ext_state.vl();
539 let vstart = ext_state.vstart();
540 let index_base_reg = vs2;
541
542 // SAFETY: `vl <= VLMAX <= VLEN` (precondition), so `vl.div_ceil(8) <= VLEN / 8 = VLENB`.
543 let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
544
545 for i in u32::from(vstart)..vl {
546 if !vm && !mask_bit(&mask_buf, i) {
547 continue;
548 }
549
550 // SAFETY: need `index_base_reg + i / (VLENB / index_eew.bytes()) < 32`.
551 //
552 // The caller verified `vs2` is aligned to `EMUL_index` registers and that
553 // `vs2.to_bits() + EMUL_index <= 32`. `EMUL_index` is defined so that
554 // `EMUL_index * (VLENB / index_eew.bytes()) = VLMAX`. Since `i < vl <= VLMAX`,
555 // `i / (VLENB / index_eew.bytes()) < EMUL_index`, and therefore
556 // `index_base_reg + i / (VLENB / index_eew.bytes()) < index_base_reg + EMUL_index <= 32`.
557 let index_buf =
558 unsafe { read_group_element(ext_state.read_vregs(), index_base_reg, i, index_eew) };
559 let offset = u64::from_le_bytes(index_buf);
560 let elem_addr = base.wrapping_add(offset);
561
562 let data_elem_bytes = data_eew.bytes_width();
563 for f in 0..nf.fields_per_segment() {
564 let addr = elem_addr.wrapping_add(u64::from(f) * u64::from(data_elem_bytes));
565 let data = match read_mem_element(memory, addr, data_eew) {
566 Ok(data) => data,
567 Err(mem_err) => {
568 cold_path();
569 if f > 0 || i > u32::from(vstart) {
570 ext_state.mark_vs_dirty();
571 ext_state.set_vstart(i as u16);
572 }
573 return Err(ExecutionError::MemoryAccess(mem_err));
574 }
575 };
576 // SAFETY: Guaranteed by function contract
577 let field_base_reg =
578 unsafe { VReg::from_bits(vd.to_bits() + f * data_group_regs).unwrap_unchecked() };
579 // SAFETY: need `field_base_reg + i / (VLENB / data_eew.bytes()) < 32`.
580 //
581 // Let `data_elems_per_reg = VLENB / data_eew.bytes()`.
582 // `i < vl <= data_group_regs * data_elems_per_reg` (precondition), so
583 // `i / data_elems_per_reg < data_group_regs`.
584 //
585 // `field_base_reg = vd.to_bits() + f * data_group_regs`. Since `f < nf` and
586 // `vd.to_bits() + nf * data_group_regs <= 32` (precondition):
587 // `field_base_reg + data_group_regs <= vd.to_bits() + (f+1) * data_group_regs
588 // <= vd.to_bits() + nf * data_group_regs <= 32`.
589 //
590 // Therefore,
591 // `field_base_reg + i / data_elems_per_reg < field_base_reg + data_group_regs <= 32`.
592 unsafe {
593 write_group_element(ext_state.write_vregs(), field_base_reg, i, data_eew, data);
594 }
595 }
596 }
597
598 ext_state.mark_vs_dirty();
599 ext_state.reset_vstart();
600 Ok(())
601}