ab_riscv_interpreter/v/zvexx/load/zvexx_load_helpers.rs
1//! Opaque helpers for ZveXx extension
2
3use crate::v::vector_registers::{VectorRegisterFile, VectorRegistersExt};
4use crate::v::zvexx::zvexx_helpers::INSTRUCTION_SIZE;
5use crate::{ExecutionError, ProgramCounter, VirtualMemory, VirtualMemoryError};
6use ab_riscv_primitives::prelude::*;
7use core::cmp::Ordering;
8use core::fmt;
9
10/// Return whether mask bit `i` is set in the mask byte slice.
11///
12/// Bits are stored LSB-first within each byte: bit `i` is at byte `i / 8`, position `i % 8`.
13/// Returns `false` for any `i` outside the slice bounds.
14#[inline(always)]
15pub(in super::super) fn mask_bit(mask: &[u8], i: u32) -> bool {
16 mask.get((i / u8::BITS) as usize)
17 .is_some_and(|b| (b >> (i % u8::BITS)) & 1 != 0)
18}
19
20/// Copy the mask bytes needed to cover `vl` elements from `v0` into a stack buffer and return
21/// it. The copy releases the shared borrow on the register file so the caller can immediately
22/// take an exclusive borrow for writes.
23///
24/// When `vm=true` (unmasked), the buffer is filled with `0xff` so that every mask bit reads as `1`.
25/// This means callers can unconditionally call [`mask_bit()`] on the returned buffer without
26/// branching on `vm`. Current callers short-circuit with `!vm &&` before calling [`mask_bit()`] as
27/// a micro-optimization on the common unmasked path, but correctness does not depend on that guard:
28/// if it were removed, the `0xff` fill ensures [`mask_bit()`] would return `true` for every
29/// element, preserving the unmasked semantics.
30///
31/// # Safety
32/// `vl.div_ceil(8)` must be `<= VLENB`. This holds when `vl <= VLEN`, which is always true
33/// when `vl` is the current architectural `vl` (bounded by `VLMAX <= VLEN`).
34#[inline(always)]
35pub(in super::super) unsafe fn snapshot_mask<const VLENB: usize>(
36 vregs: &VectorRegisterFile<VLENB>,
37 vm: bool,
38 vl: u32,
39) -> [u8; VLENB] {
40 let mut buf = [0u8; VLENB];
41 if vm {
42 // All-ones: every element active
43 buf = [0xffu8; VLENB];
44 } else {
45 let mask_bytes = vl.div_ceil(u8::BITS) as usize;
46 // SAFETY: `mask_bytes <= VLENB` by the caller's precondition
47 unsafe {
48 buf.get_unchecked_mut(..mask_bytes)
49 .copy_from_slice(vregs.get(VReg::V0).get_unchecked(..mask_bytes));
50 }
51 }
52 buf
53}
54
55/// Return whether register groups `[a, a+a_regs)` and `[b, b+b_regs)` overlap.
56#[inline(always)]
57#[doc(hidden)]
58pub fn groups_overlap(a: VReg, a_regs: u8, b: VReg, b_regs: u8) -> bool {
59 let (a, b) = (a.to_bits(), b.to_bits());
60 a < b + b_regs && b < a + a_regs
61}
62
63/// Return whether a *non-segment* indexed load's data destination group
64/// `[vd, vd + data_regs)` may legally overlap its index source group `[vs2, vs2 + index_regs)`.
65///
66/// The data EEW equals `sew` (indexed loads take their data width from `vtype.vsew()`) with
67/// `EMUL = LMUL`, whereas the index group has EEW `index_eew` and `EMUL = (index_eew / sew) *
68/// LMUL`. Because the two groups can have different EEW, the general vector register overlap
69/// constraint applies: a destination group may overlap a source group only when one of the
70/// following holds:
71///
72/// - the EEWs are equal (the groups coincide); or
73/// - the destination EEW is smaller and the overlap is in the lowest-numbered part of the source
74/// group, i.e. the destination starts at the source's base register (`vd == vs2`); or
75/// - the destination EEW is larger, the source EMUL is at least one register, and the overlap is in
76/// the highest-numbered part of the destination group, i.e. both groups end at the same register
77/// (`vd + data_regs == vs2 + index_regs`).
78///
79/// Groups that do not overlap at all are always permitted. Any other overlap is reserved.
80///
81/// Unlike indexed *segment* loads (which forbid any `vd`/`vs2` overlap to remain restartable),
82/// these relaxed rules are what allow encodings such as `vluxei32.v v16, (s2), v16` when the data
83/// and index EEW match.
84#[inline(always)]
85#[doc(hidden)]
86pub fn indexed_load_overlap_allowed(
87 vd: VReg,
88 data_regs: u8,
89 vs2: VReg,
90 index_regs: u8,
91 index_eew: Eew,
92 sew: Vsew,
93 vlmul: Vlmul,
94) -> bool {
95 if !groups_overlap(vd, data_regs, vs2, index_regs) {
96 return true;
97 }
98
99 match sew.bytes_width().cmp(&index_eew.bytes_width()) {
100 // Equal EEW: the two groups coincide, overlap is permitted.
101 Ordering::Equal => true,
102 // Smaller data EEW: overlap must be in the lowest-numbered part of the index group, which
103 // (given both groups are alignment-checked) means the data group starts at the index base.
104 Ordering::Less => vd == vs2,
105 // Larger data EEW: overlap must be in the highest-numbered part of the data group, and the
106 // index EMUL must be at least one full register. `index_regs` alone cannot distinguish a
107 // whole-register EMUL from a fractional one clamped to a single register, so the EMUL is
108 // recomputed here as `(index_eew / sew) * LMUL >= 1`.
109 Ordering::Greater => {
110 let (lmul_num, lmul_den) = vlmul.as_fraction();
111 let index_emul_at_least_one = u16::from(index_eew.bits_width()) * u16::from(lmul_num)
112 >= u16::from(sew.bits_width()) * u16::from(lmul_den);
113 let (vd, vs2) = (vd.to_bits(), vs2.to_bits());
114 index_emul_at_least_one && vd + data_regs == vs2 + index_regs
115 }
116 }
117}
118
119/// Check that `vd` is aligned to `group_regs` and that the group fits within `[0, 32)`.
120///
121/// Per spec, the base register of every register group must be a multiple of the group size.
122#[inline(always)]
123#[doc(hidden)]
124pub fn check_register_group_alignment<Reg, Memory, PC, CustomError>(
125 program_counter: &PC,
126 vd: VReg,
127 group_regs: u8,
128) -> Result<(), ExecutionError<Reg::Type, CustomError>>
129where
130 Reg: Register,
131 PC: ProgramCounter<Reg::Type, Memory, CustomError>,
132{
133 let vd = vd.to_bits();
134 if !vd.is_multiple_of(group_regs) || vd + group_regs > 32 {
135 return Err(ExecutionError::IllegalInstruction {
136 address: program_counter.old_pc(INSTRUCTION_SIZE),
137 });
138 }
139 Ok(())
140}
141
142/// Validate segment register layout: all `nf` field groups fit within `[0, 32)`, the base
143/// register is group-aligned, and the first field group does not include `v0` when masked.
144///
145/// Field `f` occupies registers `[vd + f * group_regs, vd + f * group_regs + group_regs)`.
146/// On `Ok`, `vd.to_bits() + nf * group_regs <= 32` is guaranteed.
147#[inline(always)]
148#[doc(hidden)]
149pub fn validate_segment_registers<Reg, Memory, PC, CustomError>(
150 program_counter: &PC,
151 vd: VReg,
152 vm: bool,
153 group_regs: u8,
154 nf: Nf,
155) -> Result<(), ExecutionError<Reg::Type, CustomError>>
156where
157 Reg: Register,
158 PC: ProgramCounter<Reg::Type, Memory, CustomError>,
159{
160 let group_regs = u32::from(group_regs);
161 let nf = u32::from(nf.fields_per_segment());
162 let vd_idx = u32::from(vd.to_bits());
163 if vd_idx % group_regs != 0 || vd_idx + nf * group_regs > 32 {
164 return Err(ExecutionError::IllegalInstruction {
165 address: program_counter.old_pc(INSTRUCTION_SIZE),
166 });
167 }
168 // When masked, no field group may contain v0 (index 0). Since groups are laid out
169 // contiguously from vd and vd is group-aligned, only the first field (f=0) could contain
170 // v0, which happens exactly when vd == 0.
171 if !vm && vd_idx == 0 {
172 return Err(ExecutionError::IllegalInstruction {
173 address: program_counter.old_pc(INSTRUCTION_SIZE),
174 });
175 }
176 Ok(())
177}
178
179/// Read element `elem_i` from register group `[base_reg, base_reg + group_regs)` into a
180/// `[u8; Eew::MAX_BYTES]` buffer.
181///
182/// The in-register position of element `elem_i` is:
183/// - register `base_reg + elem_i / (VLENB / eew.bytes())`
184/// - byte offset `(elem_i % (VLENB / eew.bytes())) * eew.bytes()`
185///
186/// The result is placed in `buf[..eew.bytes()]`; the remaining bytes are zero.
187///
188/// # Safety
189/// `base_reg + elem_i / (VLENB / eew.bytes())` must be less than 32, i.e. `elem_i` must be
190/// a valid element index within the register group.
191#[inline(always)]
192pub(in super::super) unsafe fn read_group_element<const VLENB: usize>(
193 vregs: &VectorRegisterFile<VLENB>,
194 base_reg: VReg,
195 // TODO: `elem_i` here and in other places shouldn't be `u32`
196 elem_i: u32,
197 eew: Eew,
198) -> [u8; Eew::MAX_BYTES as usize] {
199 let elem_bytes = usize::from(eew.bytes_width());
200 let elems_per_reg = VLENB / elem_bytes;
201 let reg_off = elem_i as usize / elems_per_reg;
202 let byte_off = (elem_i as usize % elems_per_reg) * elem_bytes;
203 // SAFETY: `base_reg + reg_off < 32` by the caller's precondition
204 let reg = unsafe {
205 vregs.get(VReg::from_bits(base_reg.to_bits() + reg_off as u8).unwrap_unchecked())
206 };
207 // SAFETY: `byte_off + elem_bytes <= VLENB`: the maximum `byte_off` is
208 // `(elems_per_reg - 1) * elem_bytes = VLENB - elem_bytes`, so
209 // `byte_off + elem_bytes <= VLENB - elem_bytes + elem_bytes = VLENB`.
210 // `elem_bytes <= Eew::MAX_BYTES`: all `Eew` variants are at most E64.
211 let src = unsafe { reg.get_unchecked(byte_off..byte_off + elem_bytes) };
212 let mut buf = [0; _];
213 // SAFETY: `elem_bytes <= Eew::MAX_BYTES` as established above, so `..elem_bytes` is in bounds
214 // for `buf`
215 unsafe { buf.get_unchecked_mut(..elem_bytes) }.copy_from_slice(src);
216 buf
217}
218
219/// Write `eew`-sized data from `buf[..eew.bytes()]` into element `elem_i` of register group
220/// `[base_reg, base_reg + group_regs)`.
221///
222/// The in-register position follows the same layout as [`read_group_element`].
223///
224/// # Safety
225/// `base_reg + elem_i / (VLENB / eew.bytes())` must be less than 32, i.e. `elem_i` must be
226/// a valid element index within the register group.
227#[inline(always)]
228unsafe fn write_group_element<const VLENB: usize>(
229 vregs: &mut VectorRegisterFile<VLENB>,
230 base_reg: VReg,
231 elem_i: u32,
232 eew: Eew,
233 buf: [u8; Eew::MAX_BYTES as usize],
234) {
235 let elem_bytes = usize::from(eew.bytes_width());
236 let elems_per_reg = VLENB / elem_bytes;
237 let reg_off = elem_i as usize / elems_per_reg;
238 let byte_off = (elem_i as usize % elems_per_reg) * elem_bytes;
239 // SAFETY: `base_reg + reg_off < 32` by the caller's precondition
240 let reg = unsafe {
241 vregs.get_mut(VReg::from_bits(base_reg.to_bits() + reg_off as u8).unwrap_unchecked())
242 };
243 // SAFETY: `byte_off + elem_bytes <= VLENB` and `elem_bytes <= Eew::MAX_BYTES`: same argument as
244 // in `read_group_element`
245 let dst = unsafe { reg.get_unchecked_mut(byte_off..byte_off + elem_bytes) };
246 // SAFETY: `elem_bytes <= Eew::MAX_BYTES` as established above, so `..elem_bytes` is in bounds
247 // for `buf`
248 dst.copy_from_slice(unsafe { buf.get_unchecked(..elem_bytes) });
249}
250
251/// Read `eew`-sized data from memory at `addr` into a `[u8; Eew::MAX_BYTES]` buffer
252/// (little-endian)
253#[inline(always)]
254fn read_mem_element(
255 memory: &impl VirtualMemory,
256 addr: u64,
257 eew: Eew,
258) -> Result<[u8; Eew::MAX_BYTES as usize], VirtualMemoryError> {
259 let mut out = [0; _];
260 out[..usize::from(eew.bytes_width())]
261 .copy_from_slice(memory.read_slice(addr, u32::from(eew.bytes_width()))?);
262 Ok(out)
263}
264
265/// Execute a unit-stride or unit-stride segment load (including fault-only-first variants).
266///
267/// Segment stride between elements is `nf * eew.bytes()`. Field `f` for element `i` is at
268/// `base + i * nf * eew.bytes() + f * eew.bytes()`. When `nf == 1` this degenerates to a
269/// plain unit-stride load.
270///
271/// When `fault_only_first` is set: a memory error at element `i > 0` truncates `vl` to `i`
272/// and returns `Ok`. An error at element `0` always propagates.
273///
274/// # Safety
275/// - `vd.to_bits() % group_regs == 0`
276/// - `vd.to_bits() + nf * group_regs <= 32`
277/// - `vl <= group_regs * VLENB / eew.bytes()` (all `vl` elements fit within the destination
278/// register group; this holds when `vl` is the architectural `vl` and `group_regs` is the EMUL
279/// register count for the given `eew` and `vtype`)
280/// - When `vm=false`: `vd` does not overlap `v0` (i.e. `vd.to_bits() != 0`)
281#[inline(always)]
282#[expect(clippy::too_many_arguments, reason = "Internal API")]
283#[doc(hidden)]
284pub unsafe fn execute_unit_stride_load<Reg, ExtState, Memory, CustomError>(
285 ext_state: &mut ExtState,
286 memory: &Memory,
287 vd: VReg,
288 vm: bool,
289 base: u64,
290 eew: Eew,
291 group_regs: u8,
292 nf: Nf,
293 fault_only_first: bool,
294) -> Result<(), ExecutionError<Reg::Type, CustomError>>
295where
296 Reg: Register,
297 ExtState: VectorRegistersExt<Reg, CustomError>,
298 [(); ExtState::ELEN as usize]:,
299 [(); ExtState::VLEN as usize]:,
300 [(); ExtState::VLENB as usize]:,
301 Memory: VirtualMemory,
302 CustomError: fmt::Debug,
303{
304 let vl = ext_state.vl();
305 let vstart = ext_state.vstart();
306 let elem_bytes = eew.bytes_width();
307 let segment_stride = u64::from(nf.fields_per_segment()) * u64::from(elem_bytes);
308
309 // SAFETY: `vl <= VLMAX <= VLEN`, so `vl.div_ceil(8) <= VLENB`.
310 let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
311
312 for i in u32::from(vstart)..vl {
313 if !vm && !mask_bit(&mask_buf, i) {
314 continue;
315 }
316
317 let elem_base = base.wrapping_add(u64::from(i) * segment_stride);
318
319 // Read all nf fields into a stack buffer before writing any of them.
320 // This ensures a fault on field f>0 leaves the destination registers untouched for the
321 // faulting element, so only elements with index new_vl are ever written (fault-only-first
322 // semantics).
323 //
324 // Sized by `Nf::MAX * Eew::MAX_BYTES`: the V spec allows at most 8 fields (nf in 1..=8)
325 // each is at most 8 bytes (E64), giving 64 bytes.
326 let mut field_buf =
327 [[0u8; usize::from(Eew::MAX_BYTES)]; usize::from(Nf::MAX.fields_per_segment())];
328
329 for f in 0..nf.fields_per_segment() {
330 let addr = elem_base.wrapping_add(u64::from(f * elem_bytes));
331 match read_mem_element(memory, addr, eew) {
332 Ok(data) => {
333 // SAFETY: `f < nf` and the precondition on this function requires
334 // `nf <= Nf::MAX` (the V spec encodes nf in 3 bits giving 1..=Nf::MAX, and the
335 // decoder enforces this before constructing the instruction). Therefore, `f as
336 // usize < nf as usize <= Nf::MAX`, which is exactly the length of `field_buf`.
337 unsafe {
338 *field_buf.get_unchecked_mut(f as usize) = data;
339 }
340 }
341 Err(mem_err) => {
342 if fault_only_first && i > 0 {
343 ext_state.set_vl(i);
344 ext_state.mark_vs_dirty();
345 ext_state.reset_vstart();
346 return Ok(());
347 }
348 if i > u32::from(vstart) {
349 // Elements [vstart, i) were committed; VS is now dirty.
350 ext_state.mark_vs_dirty();
351 // vstart records the faulting element for restartability.
352 ext_state.set_vstart(i as u16);
353 }
354 return Err(ExecutionError::MemoryAccess(mem_err));
355 }
356 }
357 }
358
359 // All nf fields for element i were read successfully; commit to the register file.
360 for f in 0..nf.fields_per_segment() {
361 // SAFETY: Guaranteed by function contract
362 let field_base_reg =
363 unsafe { VReg::from_bits(vd.to_bits() + f * group_regs).unwrap_unchecked() };
364 // SAFETY: need `field_base_reg + i / (VLENB / elem_bytes) < 32`.
365 //
366 // Let `elems_per_reg = VLENB / elem_bytes`.
367 // `i < vl <= group_regs * elems_per_reg` (precondition), so
368 // `i / elems_per_reg < group_regs`.
369 //
370 // `field_base_reg = vd.to_bits() + f * group_regs`. Since `f < nf` and the
371 // precondition guarantees `vd.to_bits() + nf * group_regs <= 32`:
372 // `field_base_reg + group_regs <= vd.to_bits() + (f+1) * group_regs
373 // <= vd.to_bits() + nf * group_regs <= 32`.
374 //
375 // Therefore, `field_base_reg + i / elems_per_reg
376 // < field_base_reg + group_regs <= 32`.
377 //
378 // For `field_buf`: `f < nf <= Nf::MAX` (the same argument as in the read loop
379 // above), so `f as usize < Nf::MAX = field_buf.len()`.
380 unsafe {
381 write_group_element(
382 ext_state.write_vregs(),
383 field_base_reg,
384 i,
385 eew,
386 *field_buf.get_unchecked(f as usize),
387 );
388 }
389 }
390 }
391
392 ext_state.mark_vs_dirty();
393 ext_state.reset_vstart();
394 Ok(())
395}
396
397/// Execute a strided or strided segment load.
398///
399/// `addr[i] = base + i * stride` where `stride` is a signed XLEN-wide value. Field `f` of
400/// element `i` is at `addr[i] + f * eew.bytes()`.
401///
402/// # Safety
403/// - `vd.to_bits() % group_regs == 0`
404/// - `vd.to_bits() + nf * group_regs <= 32`
405/// - `vl <= group_regs * VLENB / eew.bytes()`
406/// - When `vm=false`: `vd` does not overlap `v0` (i.e. `vd.to_bits() != 0`)
407#[inline(always)]
408#[expect(clippy::too_many_arguments, reason = "Internal API")]
409#[doc(hidden)]
410pub unsafe fn execute_strided_load<Reg, ExtState, Memory, CustomError>(
411 ext_state: &mut ExtState,
412 memory: &Memory,
413 vd: VReg,
414 vm: bool,
415 base: u64,
416 stride: i64,
417 eew: Eew,
418 group_regs: u8,
419 nf: Nf,
420) -> Result<(), ExecutionError<Reg::Type, CustomError>>
421where
422 Reg: Register,
423 ExtState: VectorRegistersExt<Reg, CustomError>,
424 [(); ExtState::ELEN as usize]:,
425 [(); ExtState::VLEN as usize]:,
426 [(); ExtState::VLENB as usize]:,
427 Memory: VirtualMemory,
428 CustomError: fmt::Debug,
429{
430 let vl = ext_state.vl();
431 let vstart = ext_state.vstart();
432 let elem_bytes = eew.bytes_width();
433
434 // SAFETY: `vl <= VLMAX <= VLEN` (precondition), so `vl.div_ceil(8) <= VLEN / 8 = VLENB`.
435 let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
436
437 for i in u32::from(vstart)..vl {
438 if !vm && !mask_bit(&mask_buf, i) {
439 continue;
440 }
441
442 let elem_base = base.wrapping_add(i64::from(i).wrapping_mul(stride).cast_unsigned());
443
444 for f in 0..nf.fields_per_segment() {
445 let addr = elem_base.wrapping_add(u64::from(f * elem_bytes));
446 let data = match read_mem_element(memory, addr, eew) {
447 Ok(data) => data,
448 Err(mem_err) => {
449 if f > 0 || i > u32::from(vstart) {
450 ext_state.mark_vs_dirty();
451 ext_state.set_vstart(i as u16);
452 }
453 return Err(ExecutionError::MemoryAccess(mem_err));
454 }
455 };
456 // SAFETY: Guaranteed by function contract
457 let field_base_reg =
458 unsafe { VReg::from_bits(vd.to_bits() + f * group_regs).unwrap_unchecked() };
459 // SAFETY: need `field_base_reg + i / (VLENB / elem_bytes) < 32`.
460 //
461 // Let `elems_per_reg = VLENB / elem_bytes`.
462 // `i < vl <= group_regs * elems_per_reg` (precondition), so
463 // `i / elems_per_reg < group_regs`.
464 //
465 // `field_base_reg = vd.to_bits() + f * group_regs`. Since `f < nf` and
466 // `vd.to_bits() + nf * group_regs <= 32` (precondition):
467 // `field_base_reg + group_regs <= vd.to_bits() + (f+1) * group_regs
468 // <= vd.to_bits() + nf * group_regs <= 32`.
469 //
470 // Therefore, `field_base_reg + i / elems_per_reg < field_base_reg + group_regs <= 32`.
471 unsafe {
472 write_group_element(ext_state.write_vregs(), field_base_reg, i, eew, data);
473 }
474 }
475 }
476
477 ext_state.mark_vs_dirty();
478 ext_state.reset_vstart();
479 Ok(())
480}
481
482/// Execute an indexed (unordered or ordered) or indexed segment load.
483///
484/// For element `i`, reads `index_eew`-sized bytes from register group `vs2` at element `i`
485/// to obtain a zero-extended byte offset, then loads `nf` data fields from
486/// `base + offset + f * data_eew.bytes()`. Unordered vs ordered is functionally identical in
487/// a software interpreter.
488///
489/// # Safety
490/// - `vd.to_bits() % data_group_regs == 0`
491/// - `vd.to_bits() + nf * data_group_regs <= 32`
492/// - `vs2.to_bits() + (vl - 1) / (VLENB / index_eew.bytes()) < 32` (all `vl` index elements fit
493/// within the register file; satisfied when `vs2` is alignment-checked against `EMUL_index` and
494/// `vl` is the architectural `vl` bounded by `VLMAX`)
495/// - `vl <= data_group_regs * VLENB / data_eew.bytes()` (all `vl` elements fit in a data group)
496/// - When `vm=false`: `vd` does not overlap `v0` (i.e. `vd.to_bits() != 0`)
497#[inline(always)]
498#[expect(clippy::too_many_arguments, reason = "Internal API")]
499#[doc(hidden)]
500pub unsafe fn execute_indexed_load<Reg, ExtState, Memory, CustomError>(
501 ext_state: &mut ExtState,
502 memory: &Memory,
503 vd: VReg,
504 vs2: VReg,
505 vm: bool,
506 base: u64,
507 data_eew: Eew,
508 index_eew: Eew,
509 data_group_regs: u8,
510 nf: Nf,
511) -> Result<(), ExecutionError<Reg::Type, CustomError>>
512where
513 Reg: Register,
514 ExtState: VectorRegistersExt<Reg, CustomError>,
515 [(); ExtState::ELEN as usize]:,
516 [(); ExtState::VLEN as usize]:,
517 [(); ExtState::VLENB as usize]:,
518 Memory: VirtualMemory,
519 CustomError: fmt::Debug,
520{
521 let vl = ext_state.vl();
522 let vstart = ext_state.vstart();
523 let index_base_reg = vs2;
524
525 // SAFETY: `vl <= VLMAX <= VLEN` (precondition), so `vl.div_ceil(8) <= VLEN / 8 = VLENB`.
526 let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
527
528 for i in u32::from(vstart)..vl {
529 if !vm && !mask_bit(&mask_buf, i) {
530 continue;
531 }
532
533 // SAFETY: need `index_base_reg + i / (VLENB / index_eew.bytes()) < 32`.
534 //
535 // The caller verified `vs2` is aligned to `EMUL_index` registers and that
536 // `vs2.to_bits() + EMUL_index <= 32`. `EMUL_index` is defined so that
537 // `EMUL_index * (VLENB / index_eew.bytes()) = VLMAX`. Since `i < vl <= VLMAX`,
538 // `i / (VLENB / index_eew.bytes()) < EMUL_index`, and therefore
539 // `index_base_reg + i / (VLENB / index_eew.bytes()) < index_base_reg + EMUL_index <= 32`.
540 let index_buf =
541 unsafe { read_group_element(ext_state.read_vregs(), index_base_reg, i, index_eew) };
542 let offset = u64::from_le_bytes(index_buf);
543 let elem_addr = base.wrapping_add(offset);
544
545 let data_elem_bytes = data_eew.bytes_width();
546 for f in 0..nf.fields_per_segment() {
547 let addr = elem_addr.wrapping_add(u64::from(f) * u64::from(data_elem_bytes));
548 let data = match read_mem_element(memory, addr, data_eew) {
549 Ok(data) => data,
550 Err(mem_err) => {
551 if f > 0 || i > u32::from(vstart) {
552 ext_state.mark_vs_dirty();
553 ext_state.set_vstart(i as u16);
554 }
555 return Err(ExecutionError::MemoryAccess(mem_err));
556 }
557 };
558 // SAFETY: Guaranteed by function contract
559 let field_base_reg =
560 unsafe { VReg::from_bits(vd.to_bits() + f * data_group_regs).unwrap_unchecked() };
561 // SAFETY: need `field_base_reg + i / (VLENB / data_eew.bytes()) < 32`.
562 //
563 // Let `data_elems_per_reg = VLENB / data_eew.bytes()`.
564 // `i < vl <= data_group_regs * data_elems_per_reg` (precondition), so
565 // `i / data_elems_per_reg < data_group_regs`.
566 //
567 // `field_base_reg = vd.to_bits() + f * data_group_regs`. Since `f < nf` and
568 // `vd.to_bits() + nf * data_group_regs <= 32` (precondition):
569 // `field_base_reg + data_group_regs <= vd.to_bits() + (f+1) * data_group_regs
570 // <= vd.to_bits() + nf * data_group_regs <= 32`.
571 //
572 // Therefore,
573 // `field_base_reg + i / data_elems_per_reg < field_base_reg + data_group_regs <= 32`.
574 unsafe {
575 write_group_element(ext_state.write_vregs(), field_base_reg, i, data_eew, data);
576 }
577 }
578 }
579
580 ext_state.mark_vs_dirty();
581 ext_state.reset_vstart();
582 Ok(())
583}