// ab_riscv_interpreter/v/zve64x/widen_narrow/zve64x_widen_narrow_helpers.rs
1//! Opaque helpers for Zve64x extension
2
3use crate::v::vector_registers::VectorRegistersExt;
4pub use crate::v::zve64x::arith::zve64x_arith_helpers::{OpSrc, check_vreg_group_alignment};
5use crate::v::zve64x::zve64x_helpers::INSTRUCTION_SIZE;
6use crate::{ExecutionError, InterpreterState, ProgramCounter, VirtualMemory};
7use ab_riscv_primitives::instructions::v::Vsew;
8use ab_riscv_primitives::registers::general_purpose::Register;
9use ab_riscv_primitives::registers::vector::VReg;
10use core::fmt;
11
12/// Check that a widening destination `vd` is aligned to `wide_group_regs` and fits within
13/// `[0,32)`, without any source overlap check
14#[inline(always)]
15#[doc(hidden)]
16pub fn check_vd_widen_no_src_check<Reg, ExtState, Memory, PC, IH, CustomError>(
17    state: &InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
18    vd: VReg,
19    wide_group_regs: u8,
20) -> Result<(), ExecutionError<Reg::Type, CustomError>>
21where
22    Reg: Register,
23    [(); Reg::N]:,
24    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
25{
26    let vd_idx = vd.bits();
27    if !vd_idx.is_multiple_of(wide_group_regs) || vd_idx + wide_group_regs > 32 {
28        return Err(ExecutionError::IllegalInstruction {
29            address: state.instruction_fetcher.old_pc(INSTRUCTION_SIZE),
30        });
31    }
32    Ok(())
33}
34
35/// Check that an extension source `vs2` is aligned to `src_group_regs`, fits in `[0,32)`, and
36/// does not overlap `vd` (which occupies `group_regs` registers).
37#[inline(always)]
38#[doc(hidden)]
39pub fn check_vs_ext_alignment<Reg, ExtState, Memory, PC, IH, CustomError>(
40    state: &InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
41    vs2: VReg,
42    src_group_regs: u8,
43    vd: VReg,
44    group_regs: u8,
45) -> Result<(), ExecutionError<Reg::Type, CustomError>>
46where
47    Reg: Register,
48    [(); Reg::N]:,
49    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
50{
51    let vs2_idx = vs2.bits();
52    if !vs2_idx.is_multiple_of(src_group_regs) || vs2_idx + src_group_regs > 32 {
53        return Err(ExecutionError::IllegalInstruction {
54            address: state.instruction_fetcher.old_pc(INSTRUCTION_SIZE),
55        });
56    }
57    // vd and vs2 must not overlap
58    if ranges_overlap(vd.bits(), group_regs, vs2_idx, src_group_regs) {
59        return Err(ExecutionError::IllegalInstruction {
60            address: state.instruction_fetcher.old_pc(INSTRUCTION_SIZE),
61        });
62    }
63    Ok(())
64}
65
66/// Check that a widening destination `vd` is aligned to `wide_group_regs`, does not overlap the
67/// `group_regs` registers starting at `vs_a` or `vs_b`, and fits within `[0, 32)`.
68///
69/// `wide_group_regs` is the pre-computed register count for the wide EMUL (2*LMUL), obtained via
70/// `Vlmul::index_register_count(wide_eew, sew)`. `group_regs` is the narrow LMUL register count.
71#[inline(always)]
72#[doc(hidden)]
73pub fn check_vd_widen_alignment<Reg, ExtState, Memory, PC, IH, CustomError>(
74    state: &InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
75    vd: VReg,
76    vs_a: VReg,
77    vs_b_opt: Option<VReg>,
78    group_regs: u8,
79    wide_group_regs: u8,
80) -> Result<(), ExecutionError<Reg::Type, CustomError>>
81where
82    Reg: Register,
83    [(); Reg::N]:,
84    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
85{
86    let vd_idx = vd.bits();
87    if !vd_idx.is_multiple_of(wide_group_regs) || vd_idx + wide_group_regs > 32 {
88        return Err(ExecutionError::IllegalInstruction {
89            address: state.instruction_fetcher.old_pc(INSTRUCTION_SIZE),
90        });
91    }
92    let va_idx = vs_a.bits();
93    if ranges_overlap(vd_idx, wide_group_regs, va_idx, group_regs) {
94        return Err(ExecutionError::IllegalInstruction {
95            address: state.instruction_fetcher.old_pc(INSTRUCTION_SIZE),
96        });
97    }
98    if let Some(vs_b) = vs_b_opt {
99        let vb_idx = vs_b.bits();
100        if ranges_overlap(vd_idx, wide_group_regs, vb_idx, group_regs) {
101            return Err(ExecutionError::IllegalInstruction {
102                address: state.instruction_fetcher.old_pc(INSTRUCTION_SIZE),
103            });
104        }
105    }
106    Ok(())
107}
108
109/// Check that a widening source `vs2` that is already 2×SEW wide is aligned to `wide_group_regs`
110/// and fits within `[0, 32)`.
111#[inline(always)]
112#[doc(hidden)]
113pub fn check_vs_wide_alignment<Reg, ExtState, Memory, PC, IH, CustomError>(
114    state: &InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
115    vs: VReg,
116    wide_group_regs: u8,
117) -> Result<(), ExecutionError<Reg::Type, CustomError>>
118where
119    Reg: Register,
120    [(); Reg::N]:,
121    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
122{
123    let vs_idx = vs.bits();
124    if !vs_idx.is_multiple_of(wide_group_regs) || vs_idx + wide_group_regs > 32 {
125        return Err(ExecutionError::IllegalInstruction {
126            address: state.instruction_fetcher.old_pc(INSTRUCTION_SIZE),
127        });
128    }
129    Ok(())
130}
131
132/// Check that a narrowing destination `vd` is aligned to `group_regs` and fits
133/// within `[0, 32)`.
134///
135/// No overlap check against `vs2` is performed here because narrowing instructions
136/// permit `vd` to alias the low half of the wide `vs2` register group per spec §11.7.
137#[inline(always)]
138#[doc(hidden)]
139pub fn check_vd_narrow_alignment<Reg, ExtState, Memory, PC, IH, CustomError>(
140    state: &InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
141    vd: VReg,
142    group_regs: u8,
143) -> Result<(), ExecutionError<Reg::Type, CustomError>>
144where
145    Reg: Register,
146    [(); Reg::N]:,
147    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
148{
149    let vd_idx = vd.bits();
150    if !vd_idx.is_multiple_of(group_regs) || vd_idx + group_regs > 32 {
151        return Err(ExecutionError::IllegalInstruction {
152            address: state.instruction_fetcher.old_pc(INSTRUCTION_SIZE),
153        });
154    }
155    Ok(())
156}
157
/// Returns `true` when `[a_start, a_start+a_len)` overlaps `[b_start, b_start+b_len)`.
#[inline(always)]
fn ranges_overlap(a_start: u8, a_len: u8, b_start: u8, b_len: u8) -> bool {
    // Disjoint iff one range ends at or before the other starts; overlap is the negation.
    !(a_start + a_len <= b_start || b_start + b_len <= a_start)
}
163
/// Return whether mask bit `i` is set in the mask byte slice (LSB-first within each byte).
#[inline(always)]
fn mask_bit(mask: &[u8], i: u32) -> bool {
    let byte_index = (i / u8::BITS) as usize;
    let bit_index = i % u8::BITS;
    // Out-of-range indices read as "not set".
    match mask.get(byte_index) {
        Some(&byte) => byte & (1 << bit_index) != 0,
        None => false,
    }
}
170
171/// Snapshot the mask register into a stack buffer.
172///
173/// When `vm=true` (unmasked), all bytes are `0xff`.
174///
175/// # Safety
176/// `vl.div_ceil(8) <= VLENB` must hold. This is guaranteed when `vl <= VLEN`.
177#[inline(always)]
178unsafe fn snapshot_mask<const VLENB: usize>(
179    vreg: &[[u8; VLENB]; 32],
180    vm: bool,
181    vl: u32,
182) -> [u8; VLENB] {
183    let mut buf = [0u8; VLENB];
184    if vm {
185        buf = [0xffu8; VLENB];
186    } else {
187        let mask_bytes = vl.div_ceil(u8::BITS) as usize;
188        // SAFETY: `mask_bytes <= VLENB` by precondition
189        unsafe {
190            buf.get_unchecked_mut(..mask_bytes)
191                .copy_from_slice(vreg[usize::from(VReg::V0.bits())].get_unchecked(..mask_bytes));
192        }
193    }
194    buf
195}
196
/// Read the low `sew_bytes` of element `elem_i` from register group `base_reg`, zero-extended to
/// `u64`.
///
/// # Safety
/// `base_reg + elem_i / (VLENB / sew_bytes) < 32`
#[inline(always)]
unsafe fn read_element_u64<const VLENB: usize>(
    vreg: &[[u8; VLENB]; 32],
    base_reg: usize,
    elem_i: u32,
    sew_bytes: usize,
) -> u64 {
    let per_reg = VLENB / sew_bytes;
    let idx = elem_i as usize;
    // The element lives `idx / per_reg` registers past the base, at byte
    // offset `(idx % per_reg) * sew_bytes` within that register.
    // SAFETY: `base_reg + idx / per_reg < 32` by caller's precondition
    let reg = unsafe { vreg.get_unchecked(base_reg + idx / per_reg) };
    let start = (idx % per_reg) * sew_bytes;
    let mut out = [0u8; 8];
    // SAFETY: `start + sew_bytes <= VLENB` and `sew_bytes <= 8`
    unsafe {
        out.get_unchecked_mut(..sew_bytes)
            .copy_from_slice(reg.get_unchecked(start..start + sew_bytes));
    }
    u64::from_le_bytes(out)
}
221
/// Write the low `sew_bytes` of `value` into element `elem_i` in register group `base_reg`.
///
/// # Safety
/// `base_reg + elem_i / (VLENB / sew_bytes) < 32`
#[inline(always)]
unsafe fn write_element_u64<const VLENB: usize>(
    vreg: &mut [[u8; VLENB]; 32],
    base_reg: u8,
    elem_i: u32,
    sew_bytes: usize,
    value: u64,
) {
    let per_reg = VLENB / sew_bytes;
    let idx = elem_i as usize;
    let start = (idx % per_reg) * sew_bytes;
    let bytes = value.to_le_bytes();
    // SAFETY: `base_reg + idx / per_reg < 32` by caller's precondition
    let reg = unsafe { vreg.get_unchecked_mut(usize::from(base_reg) + idx / per_reg) };
    // SAFETY: `start + sew_bytes <= VLENB` and `sew_bytes <= 8`, so both
    // sub-slices exist and have equal length
    unsafe {
        reg.get_unchecked_mut(start..start + sew_bytes)
            .copy_from_slice(bytes.get_unchecked(..sew_bytes));
    }
}
245
/// Sign-extend the low `sew_bits` of `val` to `i64`.
#[inline(always)]
#[doc(hidden)]
pub fn sign_extend_bits(val: u64, sew_bits: u32) -> i64 {
    // Push the sign bit of the narrow value to bit 63, then arithmetic-shift
    // it back down so the upper bits replicate it.
    let unused = u64::BITS - sew_bits;
    ((val << unused) as i64) >> unused
}
253
/// Execute a widening integer add/subtract.
///
/// Each source element is SEW-wide; the destination element is 2×SEW-wide.
/// `zero_extend_a` and `zero_extend_b` select unsigned vs signed widening for each source
/// (unsigned = zero-extend, signed = sign-extend).
///
/// `op` receives `(wide_a: u64, wide_b: u64) -> u64`.
///
/// Masked-off elements are skipped entirely, so the destination bytes they occupy are left
/// undisturbed.
///
/// # Safety
/// - `vd` aligned to `2*group_regs`, fits in `[0,32)`, does not overlap `vs2` or `src` (verified by
///   caller)
/// - `vs2` aligned to `group_regs`, fits in `[0,32)` (verified by caller)
/// - `src` register (when `OpSrc::Vreg`) aligned to `group_regs`, fits in `[0,32)` (verified by
///   caller)
/// - `vl <= group_regs * VLENB / sew_bytes` (all elements fit)
/// - SEW < 64 (wide_sew_bytes <= 8)
/// - When `vm=false`: `vd.bits() != 0`
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_widen_op<Reg, ExtState, Memory, PC, IH, CustomError, F>(
    state: &mut InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
    vd: VReg,
    vs2: VReg,
    src: OpSrc,
    vm: bool,
    vl: u32,
    vstart: u32,
    sew: Vsew,
    zero_extend_a: bool,
    zero_extend_b: bool,
    op: F,
) where
    Reg: Register,
    [(); Reg::N]:,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    Memory: VirtualMemory,
    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
    CustomError: fmt::Debug,
    F: Fn(u64, u64) -> u64,
{
    let sew_bytes = usize::from(sew.bytes());
    // 2×SEW in bytes; SEW < 64 is enforced by caller, so this is at most 8
    let wide_sew_bytes = sew_bytes * 2;
    let sew_bits = u32::from(sew.bits());

    // Snapshot the mask once up front; `vd` never overlaps v0 when masked
    // (caller guarantees `vd.bits() != 0` for `vm=false`), so the snapshot
    // stays valid for the whole loop.
    // SAFETY: `vl <= VLMAX <= VLEN`, so `vl.div_ceil(8) <= VLENB`
    let mask_buf = unsafe { snapshot_mask(state.ext_state.read_vreg(), vm, vl) };
    let vd_base = vd.bits();
    let vs2_base = vs2.bits();

    for i in vstart..vl {
        // Masked-off element: skip the write, leaving the destination undisturbed
        if !mask_bit(&mask_buf, i) {
            continue;
        }
        // SAFETY: `vs2` aligned to `group_regs`; `i < vl <= group_regs * (VLENB / sew_bytes)`
        let raw_a = unsafe {
            read_element_u64(
                state.ext_state.read_vreg(),
                usize::from(vs2_base),
                i,
                sew_bytes,
            )
        };
        let wide_a = if zero_extend_a {
            raw_a
        } else {
            sign_extend_bits(raw_a, sew_bits).cast_unsigned()
        };
        let wide_b = match &src {
            OpSrc::Vreg(vs1_base) => {
                // SAFETY: same argument as vs2
                let raw_b = unsafe {
                    read_element_u64(
                        state.ext_state.read_vreg(),
                        usize::from(*vs1_base),
                        i,
                        sew_bytes,
                    )
                };
                if zero_extend_b {
                    raw_b
                } else {
                    sign_extend_bits(raw_b, sew_bits).cast_unsigned()
                }
            }
            OpSrc::Scalar(val) => {
                // Truncate scalar to SEW bits, then apply the same zero/sign extension
                // as vector elements. Per spec, only the low SEW bits of the scalar
                // participate as the source operand.
                // (`1u64 << sew_bits` cannot overflow: SEW < 64 by precondition.)
                let truncated = val & ((1u64 << sew_bits) - 1);
                if zero_extend_b {
                    truncated
                } else {
                    sign_extend_bits(truncated, sew_bits).cast_unsigned()
                }
            }
        };
        let result = op(wide_a, wide_b);
        // SAFETY: `vd` aligned to `2*group_regs`; `i < vl <= group_regs * (VLENB / sew_bytes)`
        // so `i < 2*group_regs * (VLENB / wide_sew_bytes)` - element fits in the wide group
        unsafe {
            write_element_u64(
                state.ext_state.write_vreg(),
                vd_base,
                i,
                wide_sew_bytes,
                result,
            );
        }
    }
    state.ext_state.mark_vs_dirty();
    state.ext_state.reset_vstart();
}
371
/// Execute a widening add/subtract where `vs2` is already 2×SEW wide.
///
/// `vs2` is read at `wide_sew_bytes`; `src` (narrow) is read at `sew_bytes` and widened.
/// `zero_extend_b` selects unsigned vs signed widening for the narrow source operand.
///
/// Masked-off elements are skipped entirely, so the destination bytes they occupy are left
/// undisturbed.
///
/// # Safety
/// - `vd` aligned to `2*group_regs`, fits in `[0,32)`, does not overlap `vs2` or `src`
/// - `vs2` aligned to `2*group_regs`, fits in `[0,32)` (wide source)
/// - `src` register (when `OpSrc::Vreg`) aligned to `group_regs`, fits in `[0,32)`
/// - `vl <= group_regs * VLENB / sew_bytes`
/// - SEW < 64
/// - When `vm=false`: `vd.bits() != 0`
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_widen_w_op<Reg, ExtState, Memory, PC, IH, CustomError, F>(
    state: &mut InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
    vd: VReg,
    vs2: VReg,
    src: OpSrc,
    vm: bool,
    vl: u32,
    vstart: u32,
    sew: Vsew,
    zero_extend_b: bool,
    op: F,
) where
    Reg: Register,
    [(); Reg::N]:,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    Memory: VirtualMemory,
    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
    CustomError: fmt::Debug,
    F: Fn(u64, u64) -> u64,
{
    let sew_bytes = usize::from(sew.bytes());
    let wide_sew_bytes = sew_bytes * 2;
    let sew_bits = u32::from(sew.bits());

    // SAFETY: `vl <= VLEN`
    let mask_buf = unsafe { snapshot_mask(state.ext_state.read_vreg(), vm, vl) };
    let vd_base = vd.bits();
    let vs2_base = vs2.bits();

    for i in vstart..vl {
        // Masked-off element: skip the write, leaving the destination undisturbed
        if !mask_bit(&mask_buf, i) {
            continue;
        }
        // vs2 is already 2×SEW; read at wide width - no extension needed
        // SAFETY: `vs2` aligned to `2*group_regs`; element `i` fits within it
        let wide_a = unsafe {
            read_element_u64(
                state.ext_state.read_vreg(),
                usize::from(vs2_base),
                i,
                wide_sew_bytes,
            )
        };
        let wide_b = match &src {
            OpSrc::Vreg(vs1_base) => {
                // SAFETY: `vs1` is aligned to `group_regs` and fits within `[0, 32)`,
                // verified by caller; `i < vl <= group_regs * (VLENB / sew_bytes)`,
                // so `vs1_base + i / elems_per_reg < vs1_base + group_regs <= 32`
                let raw_b = unsafe {
                    read_element_u64(
                        state.ext_state.read_vreg(),
                        usize::from(*vs1_base),
                        i,
                        sew_bytes,
                    )
                };
                if zero_extend_b {
                    raw_b
                } else {
                    sign_extend_bits(raw_b, sew_bits).cast_unsigned()
                }
            }
            OpSrc::Scalar(val) => {
                // Truncate scalar to SEW bits before widening
                // (`1u64 << sew_bits` cannot overflow: SEW < 64 by precondition)
                let truncated = val & ((1u64 << sew_bits) - 1);
                if zero_extend_b {
                    truncated
                } else {
                    sign_extend_bits(truncated, sew_bits).cast_unsigned()
                }
            }
        };
        let result = op(wide_a, wide_b);
        // SAFETY: same as `execute_widen_op` for vd
        unsafe {
            write_element_u64(
                state.ext_state.write_vreg(),
                vd_base,
                i,
                wide_sew_bytes,
                result,
            );
        }
    }
    state.ext_state.mark_vs_dirty();
    state.ext_state.reset_vstart();
}
477
/// Execute a narrowing right-shift.
///
/// `vs2` is 2×SEW wide; the shift amount comes from `src` (SEW-wide or scalar).
/// The shift amount is masked to `log2(2*SEW)` bits per spec §12.6.
/// `arithmetic` selects sign-extending (true) vs zero-extending (false) before shifting.
///
/// Masked-off elements are skipped entirely, so the destination bytes they occupy are left
/// undisturbed.
///
/// # Safety
/// - `vd` aligned to `group_regs`, fits in `[0,32)`
/// - `vs2` aligned to `wide_group_regs`, fits in `[0,32)`; aliasing with the low half of `vs2` is
///   permitted per spec §11.7 - reads complete before writes to any overlapping element since the
///   destination SEW is half the source SEW
/// - `src` register (when `OpSrc::Vreg`) aligned to `group_regs`, fits in `[0,32)`
/// - `vl <= group_regs * VLENB / sew_bytes`
/// - SEW < 64
/// - When `vm=false`: `vd.bits() != 0`
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_narrow_shift<Reg, ExtState, Memory, PC, IH, CustomError>(
    state: &mut InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
    vd: VReg,
    vs2: VReg,
    src: OpSrc,
    vm: bool,
    vl: u32,
    vstart: u32,
    sew: Vsew,
    arithmetic: bool,
) where
    Reg: Register,
    [(); Reg::N]:,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    Memory: VirtualMemory,
    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
    CustomError: fmt::Debug,
{
    let sew_bytes = usize::from(sew.bytes());
    let wide_sew_bytes = sew_bytes * 2;
    // Shift amount mask: log2(2*SEW) bits = log2(SEW) + 1 bits.
    // `2*SEW` is a power of two, so `2*SEW - 1` is an all-ones mask of exactly
    // log2(2*SEW) bits.
    // 2*SEW in bits
    let wide_sew_bits = u32::from(sew.bits()) * 2;
    let shamt_mask = u64::from(wide_sew_bits - 1);

    // SAFETY: `vl <= VLEN`
    let mask_buf = unsafe { snapshot_mask(state.ext_state.read_vreg(), vm, vl) };
    let vd_base = vd.bits();
    let vs2_base = vs2.bits();

    for i in vstart..vl {
        // Masked-off element: skip the write, leaving the destination undisturbed
        if !mask_bit(&mask_buf, i) {
            continue;
        }
        // SAFETY: `vs2` is the wide source group
        let wide_val = unsafe {
            read_element_u64(
                state.ext_state.read_vreg(),
                usize::from(vs2_base),
                i,
                wide_sew_bytes,
            )
        };
        let shamt = match &src {
            OpSrc::Vreg(vs1_base) => {
                // SAFETY: `vs1` is aligned to `group_regs` and fits within `[0, 32)`,
                // verified by caller; `i < vl <= group_regs * (VLENB / sew_bytes)`,
                // so `vs1_base + i / elems_per_reg < vs1_base + group_regs <= 32`
                let raw = unsafe {
                    read_element_u64(
                        state.ext_state.read_vreg(),
                        usize::from(*vs1_base),
                        i,
                        sew_bytes,
                    )
                };
                raw & shamt_mask
            }
            // Scalar shift amount: only the low log2(2*SEW) bits are used per spec
            OpSrc::Scalar(val) => val & shamt_mask,
        };
        let result_wide = if arithmetic {
            // Sign-extend to i64 first, then shift arithmetically as i64 to
            // preserve sign bits, then cast back. Shifting u64 after cast_unsigned()
            // would be a logical shift and lose sign bits.
            (sign_extend_bits(wide_val, wide_sew_bits) >> shamt).cast_unsigned()
        } else {
            wide_val >> shamt
        };
        // Truncate to SEW bits
        let result = result_wide & ((1u64 << sew.bits()) - 1);
        // SAFETY: `vd` is the narrow destination group
        unsafe {
            write_element_u64(state.ext_state.write_vreg(), vd_base, i, sew_bytes, result);
        }
    }
    state.ext_state.mark_vs_dirty();
    state.ext_state.reset_vstart();
}
578
/// Execute an integer extension (vzext/vsext).
///
/// Source element width is `sew_bytes / factor`; destination is `sew_bytes`.
/// `sign` selects sign-extension (`true`, vsext) vs zero-extension (`false`, vzext).
///
/// The source EMUL = LMUL / factor; the source register group is `max(1, group_regs / factor)`
/// registers.
///
/// Masked-off elements are skipped entirely, so the destination bytes they occupy are left
/// undisturbed.
///
/// # Safety
/// - `vd` aligned to `group_regs`, fits in `[0,32)`
/// - `vs2` aligned to `src_group_regs`, fits in `[0,32)`, does not overlap `vd`
/// - `vl <= group_regs * VLENB / sew_bytes`
/// - `sew_bytes / factor >= 1` (SEW >= factor*8)
/// - When `vm=false`: `vd.bits() != 0`
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_extension<Reg, ExtState, Memory, PC, IH, CustomError>(
    state: &mut InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
    vd: VReg,
    vs2: VReg,
    vm: bool,
    vl: u32,
    vstart: u32,
    sew: Vsew,
    factor: u8,
    sign: bool,
) where
    Reg: Register,
    [(); Reg::N]:,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    Memory: VirtualMemory,
    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
    CustomError: fmt::Debug,
{
    let sew_bytes = usize::from(sew.bytes());
    // Narrow source element width: SEW/factor bytes (>= 1 by caller's precondition)
    let src_sew_bytes = sew_bytes / usize::from(factor);
    let src_sew_bits = (u32::from(sew.bits())) / u32::from(factor);

    // SAFETY: `vl <= VLEN`
    let mask_buf = unsafe { snapshot_mask(state.ext_state.read_vreg(), vm, vl) };
    let vd_base = vd.bits();
    let vs2_base = vs2.bits();

    for i in vstart..vl {
        // Masked-off element: skip the write, leaving the destination undisturbed
        if !mask_bit(&mask_buf, i) {
            continue;
        }
        // SAFETY: vs2 group covers `vl` narrow elements
        let raw = unsafe {
            read_element_u64(
                state.ext_state.read_vreg(),
                usize::from(vs2_base),
                i,
                src_sew_bytes,
            )
        };
        // `read_element_u64` already zero-extends, so vzext needs no extra work
        let result = if sign {
            sign_extend_bits(raw, src_sew_bits).cast_unsigned()
        } else {
            raw
        };
        // SAFETY: vd group covers `vl` wide elements
        unsafe {
            write_element_u64(state.ext_state.write_vreg(), vd_base, i, sew_bytes, result);
        }
    }
    state.ext_state.mark_vs_dirty();
    state.ext_state.reset_vstart();
}