// ab_riscv_interpreter/v/zve64x/load/zve64x_load_helpers.rs

//! Opaque helpers for Zve64x extension

use ab_riscv_primitives::instructions::v::Eew;
use ab_riscv_primitives::registers::general_purpose::Register;
use ab_riscv_primitives::registers::vector::VReg;
use core::fmt;

use crate::v::vector_registers::VectorRegistersExt;
use crate::v::zve64x::zve64x_helpers::INSTRUCTION_SIZE;
use crate::{ExecutionError, InterpreterState, ProgramCounter, VirtualMemory, VirtualMemoryError};
10
/// Maximum number of fields (`nf`) in a segment load/store: the V spec encodes `nf`
/// in a 3-bit field, giving `1..=8` fields.
#[doc(hidden)]
pub const MAX_NF: u8 = 8;
13
14/// Return whether mask bit `i` is set in the mask byte slice.
15///
16/// Bits are stored LSB-first within each byte: bit `i` is at byte `i / 8`, position `i % 8`.
17/// Returns `false` for any `i` outside the slice bounds.
18#[inline(always)]
19pub(in super::super) fn mask_bit(mask: &[u8], i: u32) -> bool {
20    mask.get((i / u8::BITS) as usize)
21        .is_some_and(|b| (b >> (i % u8::BITS)) & 1 != 0)
22}
23
24/// Copy the mask bytes needed to cover `vl` elements from `v0` into a stack buffer and return
25/// it. The copy releases the shared borrow on the register file so the caller can immediately
26/// take an exclusive borrow for writes.
27///
28/// When `vm=true` (unmasked), the buffer is filled with `0xff` so that every mask bit reads as `1`.
29/// This means callers can unconditionally call [`mask_bit()`] on the returned buffer without
30/// branching on `vm`. Current callers short-circuit with `!vm &&` before calling [`mask_bit()`] as
31/// a micro-optimization on the common unmasked path, but correctness does not depend on that guard:
32/// if it were removed, the `0xff` fill ensures [`mask_bit()`] would return `true` for every
33/// element, preserving the unmasked semantics.
34///
35/// # Safety
36/// `vl.div_ceil(8)` must be `<= VLENB`. This holds when `vl <= VLEN`, which is always true
37/// when `vl` is the current architectural `vl` (bounded by `VLMAX <= VLEN`).
38#[inline(always)]
39pub(in super::super) unsafe fn snapshot_mask<const VLENB: usize>(
40    vreg: &[[u8; VLENB]; 32],
41    vm: bool,
42    vl: u32,
43) -> [u8; VLENB] {
44    let mut buf = [0u8; VLENB];
45    if vm {
46        // All-ones: every element active
47        buf = [0xffu8; VLENB];
48    } else {
49        let mask_bytes = vl.div_ceil(u8::BITS) as usize;
50        // SAFETY: `mask_bytes <= VLENB` by the caller's precondition
51        unsafe {
52            buf.get_unchecked_mut(..mask_bytes)
53                .copy_from_slice(vreg[usize::from(VReg::V0.bits())].get_unchecked(..mask_bytes));
54        }
55    }
56    buf
57}
58
59/// Return whether register groups `[a, a+a_regs)` and `[b, b+b_regs)` overlap.
60#[inline(always)]
61#[doc(hidden)]
62pub fn groups_overlap(a: VReg, a_regs: u8, b: VReg, b_regs: u8) -> bool {
63    let (a, b) = (a.bits(), b.bits());
64    a < b + b_regs && b < a + a_regs
65}
66
67/// Check that `vd` is aligned to `group_regs` and that the group fits within `[0, 32)`.
68///
69/// Per spec, the base register of every register group must be a multiple of the group size.
70#[inline(always)]
71#[doc(hidden)]
72pub fn check_register_group_alignment<Reg, ExtState, Memory, PC, IH, CustomError>(
73    state: &InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
74    vd: VReg,
75    group_regs: u8,
76) -> Result<(), ExecutionError<Reg::Type, CustomError>>
77where
78    Reg: Register,
79    [(); Reg::N]:,
80    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
81{
82    let vd = vd.bits();
83    if !vd.is_multiple_of(group_regs) || vd + group_regs > 32 {
84        return Err(ExecutionError::IllegalInstruction {
85            address: state.instruction_fetcher.old_pc(INSTRUCTION_SIZE),
86        });
87    }
88    Ok(())
89}
90
91/// Validate segment register layout: all `nf` field groups fit within `[0, 32)`, the base
92/// register is group-aligned, and the first field group does not include `v0` when masked.
93///
94/// Field `f` occupies registers `[vd + f * group_regs, vd + f * group_regs + group_regs)`.
95/// On `Ok`, `vd.bits() + nf * group_regs <= 32` is guaranteed.
96#[inline(always)]
97#[doc(hidden)]
98pub fn validate_segment_registers<Reg, ExtState, Memory, PC, IH, CustomError>(
99    state: &InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
100    vd: VReg,
101    vm: bool,
102    group_regs: u8,
103    nf: u8,
104) -> Result<(), ExecutionError<Reg::Type, CustomError>>
105where
106    Reg: Register,
107    [(); Reg::N]:,
108    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
109{
110    let group_regs = u32::from(group_regs);
111    let nf = u32::from(nf);
112    let vd_idx = u32::from(vd.bits());
113    if vd_idx % group_regs != 0 || vd_idx + nf * group_regs > 32 {
114        return Err(ExecutionError::IllegalInstruction {
115            address: state.instruction_fetcher.old_pc(INSTRUCTION_SIZE),
116        });
117    }
118    // When masked, no field group may contain v0 (index 0). Since groups are laid out
119    // contiguously from vd and vd is group-aligned, only the first field (f=0) could contain
120    // v0, which happens exactly when vd == 0.
121    if !vm && vd_idx == 0 {
122        return Err(ExecutionError::IllegalInstruction {
123            address: state.instruction_fetcher.old_pc(INSTRUCTION_SIZE),
124        });
125    }
126    Ok(())
127}
128
129/// Read element `elem_i` from register group `[base_reg, base_reg + group_regs)` into a
130/// `[u8; Eew::MAX_BYTES]` buffer.
131///
132/// The in-register position of element `elem_i` is:
133///   - register `base_reg + elem_i / (VLENB / eew.bytes())`
134///   - byte offset `(elem_i % (VLENB / eew.bytes())) * eew.bytes()`
135///
136/// The result is placed in `buf[..eew.bytes()]`; the remaining bytes are zero.
137///
138/// # Safety
139/// `base_reg + elem_i / (VLENB / eew.bytes())` must be less than 32, i.e. `elem_i` must be
140/// a valid element index within the register group.
141#[inline(always)]
142pub(in super::super) unsafe fn read_group_element<const VLENB: usize>(
143    vreg: &[[u8; VLENB]; 32],
144    base_reg: usize,
145    elem_i: u32,
146    eew: Eew,
147) -> [u8; Eew::MAX_BYTES as usize] {
148    let elem_bytes = usize::from(eew.bytes());
149    let elems_per_reg = VLENB / elem_bytes;
150    let reg_off = elem_i as usize / elems_per_reg;
151    let byte_off = (elem_i as usize % elems_per_reg) * elem_bytes;
152    // SAFETY: `base_reg + reg_off < 32` by the caller's precondition.
153    let reg = unsafe { vreg.get_unchecked(base_reg + reg_off) };
154    // SAFETY: `byte_off + elem_bytes <= VLENB`: the maximum `byte_off` is
155    // `(elems_per_reg - 1) * elem_bytes = VLENB - elem_bytes`, so
156    // `byte_off + elem_bytes <= VLENB - elem_bytes + elem_bytes = VLENB`.
157    // `elem_bytes <= Eew::MAX_BYTES`: all `Eew` variants are at most E64.
158    let src = unsafe { reg.get_unchecked(byte_off..byte_off + elem_bytes) };
159    let mut buf = [0; _];
160    // SAFETY: `elem_bytes <= Eew::MAX_BYTES` as established above, so `..elem_bytes` is in bounds
161    // for `buf`
162    unsafe { buf.get_unchecked_mut(..elem_bytes) }.copy_from_slice(src);
163    buf
164}
165
166/// Write `eew`-sized data from `buf[..eew.bytes()]` into element `elem_i` of register group
167/// `[base_reg, base_reg + group_regs)`.
168///
169/// The in-register position follows the same layout as [`read_group_element`].
170///
171/// # Safety
172/// `base_reg + elem_i / (VLENB / eew.bytes())` must be less than 32, i.e. `elem_i` must be
173/// a valid element index within the register group.
174#[inline(always)]
175unsafe fn write_group_element<const VLENB: usize>(
176    vreg: &mut [[u8; VLENB]; 32],
177    base_reg: u8,
178    elem_i: u32,
179    eew: Eew,
180    buf: [u8; Eew::MAX_BYTES as usize],
181) {
182    let elem_bytes = usize::from(eew.bytes());
183    let elems_per_reg = VLENB / elem_bytes;
184    let reg_off = elem_i as usize / elems_per_reg;
185    let byte_off = (elem_i as usize % elems_per_reg) * elem_bytes;
186    // SAFETY: `base_reg + reg_off < 32` by the caller's precondition
187    let reg = unsafe { vreg.get_unchecked_mut(usize::from(base_reg) + reg_off) };
188    // SAFETY: `byte_off + elem_bytes <= VLENB` and `elem_bytes <= Eew::MAX_BYTES`: same argument as
189    // in `read_group_element`
190    let dst = unsafe { reg.get_unchecked_mut(byte_off..byte_off + elem_bytes) };
191    // SAFETY: `elem_bytes <= Eew::MAX_BYTES` as established above, so `..elem_bytes` is in bounds
192    // for `buf`
193    dst.copy_from_slice(unsafe { buf.get_unchecked(..elem_bytes) });
194}
195
196/// Read `eew`-sized data from memory at `addr` into a `[u8; Eew::MAX_BYTES]` buffer
197/// (little-endian)
198#[inline(always)]
199fn read_mem_element(
200    memory: &impl VirtualMemory,
201    addr: u64,
202    eew: Eew,
203) -> Result<[u8; Eew::MAX_BYTES as usize], VirtualMemoryError> {
204    let mut out = [0; _];
205    out[..usize::from(eew.bytes())]
206        .copy_from_slice(memory.read_slice(addr, u32::from(eew.bytes()))?);
207    Ok(out)
208}
209
/// Execute a unit-stride or unit-stride segment load (including fault-only-first variants).
///
/// Segment stride between elements is `nf * eew.bytes()`. Field `f` for element `i` is at
/// `base + i * nf * eew.bytes() + f * eew.bytes()`. When `nf == 1` this degenerates to a
/// plain unit-stride load.
///
/// When `fault_only_first` is set: a memory error at element `i > 0` truncates `vl` to `i`
/// and returns `Ok`. An error at element `0` always propagates.
///
/// # Safety
/// - `nf <= MAX_NF`
/// - `vd.bits() % group_regs == 0`
/// - `vd.bits() + nf * group_regs <= 32`
/// - `vl <= group_regs * VLENB / eew.bytes()` (all `vl` elements fit within the destination
///   register group; this holds when `vl` is the architectural `vl` and `group_regs` is the EMUL
///   register count for the given `eew` and `vtype`)
/// - When `vm=false`: `vd` does not overlap `v0` (i.e. `vd.bits() != 0`)
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_unit_stride_load<Reg, ExtState, Memory, PC, IH, CustomError>(
    state: &mut InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
    vd: VReg,
    vm: bool,
    vl: u32,
    vstart: u32,
    base: u64,
    eew: Eew,
    group_regs: u8,
    nf: u8,
    fault_only_first: bool,
) -> Result<(), ExecutionError<Reg::Type, CustomError>>
where
    Reg: Register,
    [(); Reg::N]:,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    Memory: VirtualMemory,
    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
    CustomError: fmt::Debug,
{
    let elem_bytes = eew.bytes();
    // Byte distance between consecutive segments (elements): all `nf` fields are packed
    // back-to-back per element.
    let segment_stride = u64::from(nf) * u64::from(elem_bytes);

    // SAFETY: `vl <= VLMAX <= VLEN`, so `vl.div_ceil(8) <= VLENB`.
    let mask_buf = unsafe { snapshot_mask(state.ext_state.read_vreg(), vm, vl) };

    for i in vstart..vl {
        // Inactive (masked-off) elements are skipped entirely: no memory access, no write
        if !vm && !mask_bit(&mask_buf, i) {
            continue;
        }

        // Address of field 0 of element `i`; address arithmetic wraps modulo 2^64
        let elem_base = base.wrapping_add(u64::from(i) * segment_stride);

        // Read all nf fields into a stack buffer before writing any of them.
        // This ensures a fault on field f>0 leaves the destination registers
        // untouched for the faulting element, so only elements with index
        // new_vl are ever written (fault-only-first semantics).
        //
        // Sized by `MAX_NF * Eew::MAX_BYTES`: the V spec allows at most 8
        // fields (nf in 1..=8) each is at most 8 bytes (E64), giving 64 bytes.
        let mut field_buf = [[0u8; usize::from(Eew::MAX_BYTES)]; usize::from(MAX_NF)];

        for f in 0..nf {
            let addr = elem_base.wrapping_add(u64::from(f) * u64::from(elem_bytes));
            match read_mem_element(&state.memory, addr, eew) {
                Ok(data) => {
                    // SAFETY: `f < nf` and the precondition on this function requires
                    // `nf <= MAX_NF` (the V spec encodes nf in 3 bits giving 1..=8 =
                    // MAX_NF, and the decoder enforces this before constructing the
                    // instruction). Therefore, `f as usize < nf as usize <= MAX_NF`,
                    // which is exactly the length of `field_buf`.
                    unsafe {
                        *field_buf.get_unchecked_mut(f as usize) = data;
                    }
                }
                Err(mem_err) => {
                    if fault_only_first && i > 0 {
                        // Fault-only-first: a fault past element 0 is not an error;
                        // truncate `vl` to the number of successfully completed elements
                        state.ext_state.set_vl(i);
                        state.ext_state.mark_vs_dirty();
                        state.ext_state.reset_vstart();
                        return Ok(());
                    }
                    if i > vstart {
                        // Elements [vstart, i) were committed; VS is now dirty.
                        state.ext_state.mark_vs_dirty();
                        // vstart records the faulting element for restartability.
                        // NOTE(review): `i` is truncated to u16 here; this assumes VLMAX
                        // fits in 16 bits — confirm for large-VLEN configurations.
                        state.ext_state.set_vstart(i as u16);
                    }
                    return Err(ExecutionError::MemoryAccess(mem_err));
                }
            }
        }

        // All nf fields for element i were read successfully; commit to the register file.
        for f in 0..nf {
            let field_base_reg = vd.bits() + f * group_regs;
            // SAFETY: need `field_base_reg + i / (VLENB / elem_bytes) < 32`.
            //
            // Let `elems_per_reg = VLENB / elem_bytes`.
            // `i < vl <= group_regs * elems_per_reg` (precondition), so
            // `i / elems_per_reg < group_regs`.
            //
            // `field_base_reg = vd.bits() + f * group_regs`. Since `f < nf` and the
            // precondition guarantees `vd.bits() + nf * group_regs <= 32`:
            // `field_base_reg + group_regs <= vd.bits() + (f+1) * group_regs
            //                             <= vd.bits() + nf * group_regs <= 32`.
            //
            // Therefore, `field_base_reg + i / elems_per_reg
            //            < field_base_reg + group_regs <= 32`.
            //
            // For `field_buf`: `f < nf <= MAX_NF` (the same argument as in the read loop
            // above), so `f as usize < MAX_NF = field_buf.len()`.
            unsafe {
                write_group_element(
                    state.ext_state.write_vreg(),
                    field_base_reg,
                    i,
                    eew,
                    *field_buf.get_unchecked(f as usize),
                );
            }
        }
    }

    // Completed all elements: mark vector state dirty and clear vstart.
    // NOTE(review): VS is marked dirty even when no element was active (vl == vstart or all
    // masked off) — confirm this matches the intended VS status-field semantics.
    state.ext_state.mark_vs_dirty();
    state.ext_state.reset_vstart();
    Ok(())
}
341
/// Execute a strided or strided segment load.
///
/// `addr[i] = base + i * stride` where `stride` is a signed XLEN-wide value. Field `f` of
/// element `i` is at `addr[i] + f * eew.bytes()`.
///
/// # Safety
/// - `vd.bits() % group_regs == 0`
/// - `vd.bits() + nf * group_regs <= 32`
/// - `vl <= group_regs * VLENB / eew.bytes()`
/// - When `vm=false`: `vd` does not overlap `v0` (i.e. `vd.bits() != 0`)
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_strided_load<Reg, ExtState, Memory, PC, IH, CustomError>(
    state: &mut InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
    vd: VReg,
    vm: bool,
    vl: u32,
    vstart: u32,
    base: u64,
    stride: i64,
    eew: Eew,
    group_regs: u8,
    nf: u8,
) -> Result<(), ExecutionError<Reg::Type, CustomError>>
where
    Reg: Register,
    [(); Reg::N]:,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    Memory: VirtualMemory,
    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
    CustomError: fmt::Debug,
{
    let elem_bytes = eew.bytes();

    // SAFETY: `vl <= VLMAX <= VLEN` (precondition), so `vl.div_ceil(8) <= VLEN / 8 = VLENB`.
    let mask_buf = unsafe { snapshot_mask(state.ext_state.read_vreg(), vm, vl) };

    for i in vstart..vl {
        // Inactive (masked-off) elements are skipped entirely: no memory access, no write
        if !vm && !mask_bit(&mask_buf, i) {
            continue;
        }

        // `stride` is signed: compute `base + i * stride` in two's-complement with
        // wrap-around, then reinterpret as an unsigned address
        let elem_base = base.wrapping_add(i64::from(i).wrapping_mul(stride).cast_unsigned());

        for f in 0..nf {
            let addr = elem_base.wrapping_add(u64::from(f) * u64::from(elem_bytes));
            let data = match read_mem_element(&state.memory, addr, eew) {
                Ok(data) => data,
                Err(mem_err) => {
                    // Unlike the unit-stride path, fields are committed as they are read,
                    // so a fault at field f > 0 leaves element i partially written — the
                    // state is dirty even when i == vstart.
                    if f > 0 || i > vstart {
                        state.ext_state.mark_vs_dirty();
                        // vstart records the faulting element for restartability.
                        // NOTE(review): `i` is truncated to u16 here; this assumes VLMAX
                        // fits in 16 bits — confirm for large-VLEN configurations.
                        state.ext_state.set_vstart(i as u16);
                    }
                    return Err(ExecutionError::MemoryAccess(mem_err));
                }
            };
            let field_base_reg = vd.bits() + f * group_regs;
            // SAFETY: need `field_base_reg + i / (VLENB / elem_bytes) < 32`.
            //
            // Let `elems_per_reg = VLENB / elem_bytes`.
            // `i < vl <= group_regs * elems_per_reg` (precondition), so
            // `i / elems_per_reg < group_regs`.
            //
            // `field_base_reg = vd.bits() + f * group_regs`. Since `f < nf` and
            // `vd.bits() + nf * group_regs <= 32` (precondition):
            // `field_base_reg + group_regs <= vd.bits() + (f+1) * group_regs
            //                             <= vd.bits() + nf * group_regs <= 32`.
            //
            // Therefore, `field_base_reg + i / elems_per_reg < field_base_reg + group_regs <= 32`.
            unsafe {
                write_group_element(state.ext_state.write_vreg(), field_base_reg, i, eew, data);
            }
        }
    }

    // Completed all elements: mark vector state dirty and clear vstart
    state.ext_state.mark_vs_dirty();
    state.ext_state.reset_vstart();
    Ok(())
}
425
/// Execute an indexed (unordered or ordered) or indexed segment load.
///
/// For element `i`, reads `index_eew`-sized bytes from register group `vs2` at element `i`
/// to obtain a zero-extended byte offset, then loads `nf` data fields from
/// `base + offset + f * data_eew.bytes()`. Unordered vs ordered is functionally identical in
/// a software interpreter.
///
/// # Safety
/// - `vd.bits() % data_group_regs == 0`
/// - `vd.bits() + nf * data_group_regs <= 32`
/// - `vs2.bits() + (vl - 1) / (VLENB / index_eew.bytes()) < 32` (all `vl` index elements fit within
///   the register file; satisfied when `vs2` is alignment-checked against `EMUL_index` and `vl` is
///   the architectural `vl` bounded by `VLMAX`)
/// - `vl <= data_group_regs * VLENB / data_eew.bytes()` (all `vl` elements fit in a data group)
/// - When `vm=false`: `vd` does not overlap `v0` (i.e. `vd.bits() != 0`)
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_indexed_load<Reg, ExtState, Memory, PC, IH, CustomError>(
    state: &mut InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
    vd: VReg,
    vs2: VReg,
    vm: bool,
    vl: u32,
    vstart: u32,
    base: u64,
    data_eew: Eew,
    index_eew: Eew,
    data_group_regs: u8,
    nf: u8,
) -> Result<(), ExecutionError<Reg::Type, CustomError>>
where
    Reg: Register,
    [(); Reg::N]:,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    Memory: VirtualMemory,
    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
    CustomError: fmt::Debug,
{
    let index_base_reg = usize::from(vs2.bits());

    // SAFETY: `vl <= VLMAX <= VLEN` (precondition), so `vl.div_ceil(8) <= VLEN / 8 = VLENB`.
    let mask_buf = unsafe { snapshot_mask(state.ext_state.read_vreg(), vm, vl) };

    for i in vstart..vl {
        // Inactive (masked-off) elements are skipped entirely: no memory access, no write
        if !vm && !mask_bit(&mask_buf, i) {
            continue;
        }

        // SAFETY: need `index_base_reg + i / (VLENB / index_eew.bytes()) < 32`.
        //
        // The caller verified `vs2` is aligned to `EMUL_index` registers and that
        // `vs2.bits() + EMUL_index <= 32`. `EMUL_index` is defined so that
        // `EMUL_index * (VLENB / index_eew.bytes()) = VLMAX`. Since `i < vl <= VLMAX`,
        // `i / (VLENB / index_eew.bytes()) < EMUL_index`, and therefore
        // `index_base_reg + i / (VLENB / index_eew.bytes()) < index_base_reg + EMUL_index <= 32`.
        let index_buf = unsafe {
            read_group_element(state.ext_state.read_vreg(), index_base_reg, i, index_eew)
        };
        // `read_group_element` zero-fills bytes past `index_eew.bytes()`, so this
        // little-endian u64 conversion zero-extends narrower index elements
        let offset = u64::from_le_bytes(index_buf);
        let elem_addr = base.wrapping_add(offset);

        let data_elem_bytes = data_eew.bytes();
        for f in 0..nf {
            let addr = elem_addr.wrapping_add(u64::from(f) * u64::from(data_elem_bytes));
            let data = match read_mem_element(&state.memory, addr, data_eew) {
                Ok(data) => data,
                Err(mem_err) => {
                    // Fields are committed as they are read, so a fault at field f > 0
                    // leaves element i partially written — the state is dirty even when
                    // i == vstart.
                    if f > 0 || i > vstart {
                        state.ext_state.mark_vs_dirty();
                        // vstart records the faulting element for restartability.
                        // NOTE(review): `i` is truncated to u16 here; this assumes VLMAX
                        // fits in 16 bits — confirm for large-VLEN configurations.
                        state.ext_state.set_vstart(i as u16);
                    }
                    return Err(ExecutionError::MemoryAccess(mem_err));
                }
            };
            let field_base_reg = vd.bits() + f * data_group_regs;
            // SAFETY: need `field_base_reg + i / (VLENB / data_eew.bytes()) < 32`.
            //
            // Let `data_elems_per_reg = VLENB / data_eew.bytes()`.
            // `i < vl <= data_group_regs * data_elems_per_reg` (precondition), so
            // `i / data_elems_per_reg < data_group_regs`.
            //
            // `field_base_reg = vd.bits() + f * data_group_regs`. Since `f < nf` and
            // `vd.bits() + nf * data_group_regs <= 32` (precondition):
            // `field_base_reg + data_group_regs <= vd.bits() + (f+1) * data_group_regs
            //                                  <= vd.bits() + nf * data_group_regs <= 32`.
            //
            // Therefore,
            // `field_base_reg + i / data_elems_per_reg < field_base_reg + data_group_regs <= 32`.
            unsafe {
                write_group_element(
                    state.ext_state.write_vreg(),
                    field_base_reg,
                    i,
                    data_eew,
                    data,
                );
            }
        }
    }

    // Completed all elements: mark vector state dirty and clear vstart
    state.ext_state.mark_vs_dirty();
    state.ext_state.reset_vstart();
    Ok(())
}