
ab_riscv_interpreter/v/zve64x/perm/zve64x_perm_helpers.rs

//! Permutation helpers for the Zve64x extension

use crate::v::vector_registers::VectorRegistersExt;
pub use crate::v::zve64x::arith::zve64x_arith_helpers::check_vreg_group_alignment;
use crate::v::zve64x::arith::zve64x_arith_helpers::{read_element_u64, write_element_u64};
use crate::v::zve64x::load::zve64x_load_helpers::{mask_bit, snapshot_mask};
use crate::v::zve64x::zve64x_helpers::INSTRUCTION_SIZE;
use crate::{ExecutionError, InterpreterState, ProgramCounter, VirtualMemory};
use ab_riscv_primitives::instructions::v::Vsew;
use ab_riscv_primitives::registers::general_purpose::Register;
use ab_riscv_primitives::registers::vector::VReg;
use core::fmt;

/// Check that register groups `[a, a+count)` and `[b, b+count)` do not overlap.
///
/// Both groups must have the same size `count`. For groups of different sizes use
/// [`check_no_overlap_asymmetric`].
#[inline(always)]
#[doc(hidden)]
pub fn check_no_overlap<Reg, ExtState, Memory, PC, IH, CustomError>(
    state: &InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
    a: VReg,
    b: VReg,
    count: u8,
) -> Result<(), ExecutionError<Reg::Type, CustomError>>
where
    Reg: Register,
    [(); Reg::N]:,
    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
{
    let a_start = u16::from(a.bits());
    let b_start = u16::from(b.bits());
    let count = u16::from(count);
    // Intervals [a_start, a_start+count) and [b_start, b_start+count) overlap iff
    // each starts before the other ends. Arithmetic is widened to u16 so the sums cannot
    // overflow for any u8 inputs (e.g., b_start=250 + count=8 would wrap in u8), rather
    // than relying on the `VReg < 32` and `count <= 8` invariants.
    if a_start < b_start + count && b_start < a_start + count {
        return Err(ExecutionError::IllegalInstruction {
            address: state.instruction_fetcher.old_pc(INSTRUCTION_SIZE),
        });
    }
    Ok(())
}

/// Check that register group `[a, a+a_count)` does not overlap `[b, b+b_count)`.
///
/// Unlike [`check_no_overlap`], the two groups are allowed to have different sizes.
/// Used for `vrgatherei16.vv` where vd/vs2 use LMUL-derived `group_regs` and vs1
/// uses EEW=16-derived `index_group_regs`.
#[inline(always)]
#[doc(hidden)]
pub fn check_no_overlap_asymmetric<Reg, ExtState, Memory, PC, IH, CustomError>(
    state: &InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
    a: VReg,
    a_count: u8,
    b: VReg,
    b_count: u8,
) -> Result<(), ExecutionError<Reg::Type, CustomError>>
where
    Reg: Register,
    [(); Reg::N]:,
    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
{
    let a_start = u16::from(a.bits());
    let b_start = u16::from(b.bits());
    let a_count = u16::from(a_count);
    let b_count = u16::from(b_count);
    // Intervals [a_start, a_start+a_count) and [b_start, b_start+b_count) overlap iff
    // each starts before the other ends.
    if a_start < b_start + b_count && b_start < a_start + a_count {
        return Err(ExecutionError::IllegalInstruction {
            address: state.instruction_fetcher.old_pc(INSTRUCTION_SIZE),
        });
    }
    Ok(())
}
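
// A minimal sketch (hypothetical test module, not part of the crate's suite) of the widened
// interval predicate shared by `check_no_overlap` and `check_no_overlap_asymmetric`. The
// predicate is re-derived locally because the real helpers need a full `InterpreterState`.
#[cfg(test)]
mod overlap_predicate_sketch {
    #[test]
    fn interval_overlap_matches_the_comments_above() {
        // [a, a+ac) overlaps [b, b+bc) iff each starts before the other ends.
        let overlaps = |a: u16, ac: u16, b: u16, bc: u16| a < b + bc && b < a + ac;
        // Symmetric groups: v0..v8 vs v8..v16 are disjoint; v4..v12 vs v8..v16 collide.
        assert!(!overlaps(0, 8, 8, 8));
        assert!(overlaps(4, 8, 8, 8));
        // Asymmetric groups (vrgatherei16.vv): data v8..v16 vs index v16..v18 is fine,
        // while index v8..v10 inside data v8..v16 is rejected.
        assert!(!overlaps(8, 8, 16, 2));
        assert!(overlaps(8, 2, 8, 8));
    }
}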

/// Read element 0 of register `base_reg` as `u64`, zero-extended.
///
/// # Safety
/// `base_reg < 32` and `sew.bytes() <= VLENB` must hold.
#[inline(always)]
pub unsafe fn read_element_0_u64<const VLENB: usize>(
    vreg: &[[u8; VLENB]; 32],
    base_reg: u8,
    sew: Vsew,
) -> u64 {
    let sew_bytes = usize::from(sew.bytes());
    // SAFETY: `base_reg < 32` by VReg invariant
    let reg = unsafe { vreg.get_unchecked(usize::from(base_reg)) };
    let mut buf = [0u8; 8];
    // SAFETY: `sew_bytes <= VLENB` for all legal vtype; `sew_bytes <= 8`
    unsafe { buf.get_unchecked_mut(..sew_bytes) }
        .copy_from_slice(unsafe { reg.get_unchecked(..sew_bytes) });
    u64::from_le_bytes(buf)
}

/// Write element 0 of register `base_reg` from the low `sew_bytes` of `value`.
///
/// # Safety
/// `base_reg < 32` and `sew.bytes() <= VLENB` must hold.
#[inline(always)]
pub unsafe fn write_element_0_u64<const VLENB: usize>(
    vreg: &mut [[u8; VLENB]; 32],
    base_reg: u8,
    sew: Vsew,
    value: u64,
) {
    let sew_bytes = usize::from(sew.bytes());
    let buf = value.to_le_bytes();
    // SAFETY: `base_reg < 32` by VReg invariant
    let reg = unsafe { vreg.get_unchecked_mut(usize::from(base_reg)) };
    // SAFETY: `sew_bytes <= VLENB`; `sew_bytes <= 8`
    unsafe { reg.get_unchecked_mut(..sew_bytes) }
        .copy_from_slice(unsafe { buf.get_unchecked(..sew_bytes) });
}
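
// A minimal sketch (hypothetical test) of the element-0 helpers round-tripping a value through
// a stack register file with VLENB = 16 (an arbitrary choice). The safety preconditions
// (`base_reg < 32`, `sew.bytes() <= VLENB`) hold by construction.
#[cfg(test)]
mod element_0_sketch {
    use super::*;

    #[test]
    fn element_0_round_trip_truncates_and_zero_extends() {
        let mut vreg = [[0u8; 16]; 32];
        // SAFETY: base_reg = 7 < 32 and E16 is 2 bytes <= VLENB = 16.
        unsafe {
            // Only the low `sew.bytes()` of the value are stored: the write truncates
            // 0x1_BEEF to two bytes and the read zero-extends them back to u64.
            write_element_0_u64(&mut vreg, 7, Vsew::E16, 0x1_BEEF);
            assert_eq!(read_element_0_u64(&vreg, 7, Vsew::E16), 0xBEEF);
        }
    }
}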

/// Sign-extend the low `sew.bits()` of `val` to the register type width.
///
/// The arithmetic is performed entirely in 64-bit signed integer space: we shift the SEW-wide
/// value left to place its sign bit at bit 63, then arithmetic-right-shift back to propagate it.
/// The resulting `u64` is then narrowed to `Reg::Type` (32 or 64 bits) by combining via
/// `From<u32>`, the only integer conversion in the `Register::Type` trait bounds.
///
/// For RV32 (`Reg::XLEN == 32`) the low 32 bits are already the correct sign-extended result
/// because the arithmetic shift propagates the sign across all 64 bits and then we discard the
/// upper half.
///
/// For RV64 (`Reg::XLEN == 64`) we must preserve all 64 bits. Since `Reg::Type: From<u32>` and
/// `Reg::Type: Shl<u8>`, we reconstruct the 64-bit value by OR-ing two 32-bit halves shifted
/// into position.
#[inline(always)]
pub fn sign_extend_to_reg<Reg>(val: u64, sew: Vsew) -> Reg::Type
where
    Reg: Register,
{
    let sew_bits = u32::from(sew.bits());
    // `shift` is in [0, 64). When sew_bits == 64, shift == 0 and the value is unchanged.
    let shift = u64::BITS - sew_bits;
    // Cast to i64 so the right-shift is arithmetic (sign-extending).
    let sign_extended = (val.cast_signed() << shift) >> shift;
    let raw = sign_extended.cast_unsigned();
    if Reg::XLEN == u64::BITS as u8 {
        // RV64: preserve all 64 bits by splitting into two u32 halves.
        let lo = Reg::Type::from(raw as u32);
        let hi = Reg::Type::from((raw >> u32::BITS) as u32);
        lo | (hi << 32u8)
    } else {
        // RV32: the low 32 bits are the correctly truncated result.
        Reg::Type::from(raw as u32)
    }
}
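
// A minimal sketch (hypothetical test) of the shift-pair trick `sign_extend_to_reg` builds on,
// shown directly on `u64`/`i64` (`as` casts instead of `cast_signed`/`cast_unsigned`) so it
// needs no `Register` implementation.
#[cfg(test)]
mod sign_extend_sketch {
    #[test]
    fn shift_pair_propagates_the_sign_bit() {
        let sext = |val: u64, sew_bits: u32| -> u64 {
            let shift = u64::BITS - sew_bits;
            (((val as i64) << shift) >> shift) as u64
        };
        // 0x80 at SEW=8 is -128; the arithmetic right shift fills bits 8..64.
        assert_eq!(sext(0x80, 8), 0xFFFF_FFFF_FFFF_FF80);
        // 0x7F at SEW=8 is +127; the upper bits stay clear.
        assert_eq!(sext(0x7F, 8), 0x7F);
        // SEW=64: shift is 0 and the value passes through unchanged.
        assert_eq!(sext(0x8000_0000_0000_0000, 64), 0x8000_0000_0000_0000);
    }
}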

/// Execute a vslideup operation.
///
/// Elements `vstart..min(offset, vl)` in vd are unchanged.
/// Elements `max(vstart, offset)..vl` where mask is active get vs2[i - offset].
///
/// # Safety
/// - `vd` and `vs2` are validly aligned and non-overlapping (verified by caller).
/// - `vl <= group_regs * VLENB / sew_bytes`.
/// - When `vm=false`: `vd.bits() != 0`.
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_slideup<Reg, ExtState, Memory, PC, IH, CustomError>(
    state: &mut InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
    vd: VReg,
    vs2: VReg,
    vm: bool,
    vl: u32,
    vstart: u32,
    sew: Vsew,
    offset: u64,
) where
    Reg: Register,
    [(); Reg::N]:,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    Memory: VirtualMemory,
    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
    CustomError: fmt::Debug,
{
    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
    let mask_buf = unsafe { snapshot_mask(state.ext_state.read_vreg(), vm, vl) };
    let vd_base = vd.bits();
    let vs2_base = vs2.bits();
    // Per spec §16.3.1: elements 0..offset are never written (vd keeps its value).
    // The active range starts at max(vstart, offset).
    let start = vstart.max(offset.min(u64::from(u32::MAX)) as u32);
    for i in start..vl {
        if !mask_bit(&mask_buf, i) {
            continue;
        }
        let src_idx = i as u64 - offset;
        // SAFETY: src_idx < vl <= group_regs * elems_per_reg, so source element is in range
        let val = unsafe {
            read_element_u64(
                state.ext_state.read_vreg(),
                usize::from(vs2_base),
                src_idx as u32,
                sew,
            )
        };
        // SAFETY: i < vl <= group_regs * elems_per_reg, so dest element is in range
        unsafe {
            write_element_u64(state.ext_state.write_vreg(), vd_base, i, sew, val);
        }
    }
    state.ext_state.mark_vs_dirty();
    state.ext_state.reset_vstart();
}
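
// A minimal sketch (hypothetical, on plain slices) of the slideup indexing rule for an unmasked
// group: elements below `offset` keep their previous value, the rest take `vs2[i - offset]`.
#[cfg(test)]
mod slideup_model_sketch {
    #[test]
    fn slideup_leaves_the_low_elements_untouched() {
        let vs2 = [10u64, 11, 12, 13];
        let mut vd = [0u64; 4];
        let (vl, vstart, offset) = (4usize, 0usize, 2usize);
        for i in vstart.max(offset)..vl {
            vd[i] = vs2[i - offset];
        }
        assert_eq!(vd, [0, 0, 10, 11]);
    }
}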

/// Execute a vslidedown operation.
///
/// Element `vd[i] = vs2[i + offset]` if `i + offset < vlmax`, else `0`.
///
/// # Safety
/// - `vd` and `vs2` are validly aligned (verified by caller); overlap is permitted.
/// - `vl <= vlmax`.
/// - When `vm=false`: `vd.bits() != 0`.
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_slidedown<Reg, ExtState, Memory, PC, IH, CustomError>(
    state: &mut InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
    vd: VReg,
    vs2: VReg,
    vm: bool,
    vl: u32,
    vstart: u32,
    sew: Vsew,
    vlmax: u32,
    offset: u64,
) where
    Reg: Register,
    [(); Reg::N]:,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    Memory: VirtualMemory,
    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
    CustomError: fmt::Debug,
{
    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
    let mask_buf = unsafe { snapshot_mask(state.ext_state.read_vreg(), vm, vl) };
    let vd_base = vd.bits();
    let vs2_base = vs2.bits();
    for i in vstart..vl {
        if !mask_bit(&mask_buf, i) {
            continue;
        }
        // Use checked_add to guard against offset being so large that i + offset overflows u64.
        // Any sum that would wrap past u64::MAX is trivially >= vlmax, so the spec requires
        // vd[i]=0.
        let val = if let Some(src_idx) = u64::from(i).checked_add(offset)
            && src_idx < u64::from(vlmax)
        {
            // SAFETY: src_idx < vlmax <= group_regs * elems_per_reg, so element is in range
            unsafe {
                read_element_u64(
                    state.ext_state.read_vreg(),
                    usize::from(vs2_base),
                    src_idx as u32,
                    sew,
                )
            }
        } else {
            0
        };
        // SAFETY: i < vl <= vlmax <= group_regs * elems_per_reg
        unsafe {
            write_element_u64(state.ext_state.write_vreg(), vd_base, i, sew, val);
        }
    }
    state.ext_state.mark_vs_dirty();
    state.ext_state.reset_vstart();
}
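
// A minimal sketch (hypothetical) of the slidedown source guard: a `checked_add` that
// overflows is treated exactly like any index >= vlmax and yields 0.
#[cfg(test)]
mod slidedown_model_sketch {
    #[test]
    fn out_of_range_sources_read_as_zero() {
        let vs2 = [10u64, 11, 12, 13];
        let vlmax = vs2.len() as u64;
        let gather = |i: u64, offset: u64| -> u64 {
            match i.checked_add(offset) {
                Some(src) if src < vlmax => vs2[src as usize],
                _ => 0,
            }
        };
        assert_eq!(gather(0, 2), 12);
        assert_eq!(gather(3, 2), 0); // src = 5 >= vlmax
        assert_eq!(gather(1, u64::MAX), 0); // i + offset wraps past u64::MAX
    }
}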

/// Execute a vslide1up operation.
///
/// Element 0 of vd gets `scalar` (when active and vl > 0).
/// Element `i` for `1 <= i < vl` gets `vs2[i - 1]`.
/// vd must not overlap vs2.
///
/// # Safety
/// - `vd` and `vs2` are validly aligned and non-overlapping (verified by caller).
/// - `vl <= group_regs * VLENB / sew_bytes`.
/// - When `vm=false`: `vd.bits() != 0`.
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_slide1up<Reg, ExtState, Memory, PC, IH, CustomError>(
    state: &mut InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
    vd: VReg,
    vs2: VReg,
    vm: bool,
    vl: u32,
    vstart: u32,
    sew: Vsew,
    scalar: u64,
) where
    Reg: Register,
    [(); Reg::N]:,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    Memory: VirtualMemory,
    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
    CustomError: fmt::Debug,
{
    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
    let mask_buf = unsafe { snapshot_mask(state.ext_state.read_vreg(), vm, vl) };
    let vd_base = vd.bits();
    let vs2_base = vs2.bits();
    for i in vstart..vl {
        if !mask_bit(&mask_buf, i) {
            continue;
        }
        let val = if i == 0 {
            scalar
        } else {
            // SAFETY: i - 1 < vl <= group_regs * elems_per_reg
            unsafe {
                read_element_u64(
                    state.ext_state.read_vreg(),
                    usize::from(vs2_base),
                    i - 1,
                    sew,
                )
            }
        };
        // SAFETY: i < vl <= group_regs * elems_per_reg
        unsafe {
            write_element_u64(state.ext_state.write_vreg(), vd_base, i, sew, val);
        }
    }
    state.ext_state.mark_vs_dirty();
    state.ext_state.reset_vstart();
}

/// Execute a vslide1down operation.
///
/// Element `vd[i] = vs2[i + 1]` for `i < vl - 1`; element `vd[vl - 1]` gets `scalar`.
///
/// Overlap between `vd` and `vs2` is permitted by the spec. When they share the same register
/// group base (exact overlap), ascending iteration is still correct: each write goes to byte range
/// `[i*sew, (i+1)*sew)` while the subsequent read comes from `[(i+1)*sew, (i+2)*sew)`. These
/// ranges are adjacent and non-overlapping, so writing element `i` never corrupts the source bytes
/// of element `i+1`.
///
/// # Safety
/// - `vd` and `vs2` are validly aligned (verified by caller); overlap is permitted.
/// - `vl <= group_regs * VLENB / sew_bytes`.
/// - When `vm=false`: `vd.bits() != 0`.
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_slide1down<Reg, ExtState, Memory, PC, IH, CustomError>(
    state: &mut InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
    vd: VReg,
    vs2: VReg,
    vm: bool,
    vl: u32,
    vstart: u32,
    sew: Vsew,
    scalar: u64,
) where
    Reg: Register,
    [(); Reg::N]:,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    Memory: VirtualMemory,
    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
    CustomError: fmt::Debug,
{
    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
    let mask_buf = unsafe { snapshot_mask(state.ext_state.read_vreg(), vm, vl) };
    let vd_base = vd.bits();
    let vs2_base = vs2.bits();
    for i in vstart..vl {
        if !mask_bit(&mask_buf, i) {
            continue;
        }
        let val = if i + 1 < vl {
            // SAFETY: i + 1 < vl <= group_regs * elems_per_reg
            unsafe {
                read_element_u64(
                    state.ext_state.read_vreg(),
                    usize::from(vs2_base),
                    i + 1,
                    sew,
                )
            }
        } else {
            scalar
        };
        // SAFETY: i < vl <= group_regs * elems_per_reg
        unsafe {
            write_element_u64(state.ext_state.write_vreg(), vd_base, i, sew, val);
        }
    }
    state.ext_state.mark_vs_dirty();
    state.ext_state.reset_vstart();
}
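
// A minimal sketch (hypothetical) of the adjacency argument above: with exact vd/vs2 overlap,
// ascending iteration writes element `i` before reading element `i + 1`, and those byte ranges
// never intersect.
#[cfg(test)]
mod slide1down_overlap_sketch {
    #[test]
    fn in_place_slide1down_is_correct_in_ascending_order() {
        let mut v = [10u64, 11, 12, 13]; // vd and vs2 share the same register group
        let (vl, scalar) = (v.len(), 99u64);
        for i in 0..vl {
            v[i] = if i + 1 < vl { v[i + 1] } else { scalar };
        }
        assert_eq!(v, [11, 12, 13, 99]);
    }
}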

/// Execute vrgather.vv: `vd[i] = (vs1[i] < vlmax) ? vs2[vs1[i]] : 0`.
///
/// # Safety
/// - `vd`, `vs2`, and `vs1` are validly aligned and mutually non-overlapping (verified by caller).
/// - `vl <= vlmax`.
/// - When `vm=false`: `vd.bits() != 0`.
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_rgather_vv<Reg, ExtState, Memory, PC, IH, CustomError>(
    state: &mut InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
    vd: VReg,
    vs2: VReg,
    vs1: VReg,
    vm: bool,
    vl: u32,
    vstart: u32,
    sew: Vsew,
    vlmax: u32,
) where
    Reg: Register,
    [(); Reg::N]:,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    Memory: VirtualMemory,
    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
    CustomError: fmt::Debug,
{
    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
    let mask_buf = unsafe { snapshot_mask(state.ext_state.read_vreg(), vm, vl) };
    let vd_base = vd.bits();
    let vs2_base = vs2.bits();
    let vs1_base = vs1.bits();
    for i in vstart..vl {
        if !mask_bit(&mask_buf, i) {
            continue;
        }
        // SAFETY: i < vl <= group_regs * elems_per_reg for vs1
        let index =
            unsafe { read_element_u64(state.ext_state.read_vreg(), usize::from(vs1_base), i, sew) };
        let val = if index < u64::from(vlmax) {
            // SAFETY: index < vlmax <= group_regs * elems_per_reg for vs2
            unsafe {
                read_element_u64(
                    state.ext_state.read_vreg(),
                    usize::from(vs2_base),
                    index as u32,
                    sew,
                )
            }
        } else {
            0u64
        };
        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
        unsafe {
            write_element_u64(state.ext_state.write_vreg(), vd_base, i, sew, val);
        }
    }
    state.ext_state.mark_vs_dirty();
    state.ext_state.reset_vstart();
}
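
// A minimal sketch (hypothetical, on plain slices) of the vrgather.vv element rule: indices at
// or beyond vlmax gather 0 rather than trapping.
#[cfg(test)]
mod rgather_model_sketch {
    #[test]
    fn out_of_range_indices_gather_zero() {
        let vs2 = [10u64, 11, 12, 13];
        let vs1 = [3u64, 0, 200, 1]; // index vector
        let vlmax = vs2.len() as u64;
        let mut vd = [0u64; 4];
        for (i, &index) in vs1.iter().enumerate() {
            vd[i] = if index < vlmax { vs2[index as usize] } else { 0 };
        }
        assert_eq!(vd, [13, 10, 0, 11]);
    }
}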

/// Execute vrgather.vx / vrgather.vi: all active elements get `vs2[index]` or `0`.
///
/// # Safety
/// - `vd` and `vs2` are validly aligned and non-overlapping (verified by caller).
/// - `vl <= vlmax`.
/// - When `vm=false`: `vd.bits() != 0`.
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_rgather_scalar<Reg, ExtState, Memory, PC, IH, CustomError>(
    state: &mut InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
    vd: VReg,
    vs2: VReg,
    vm: bool,
    vl: u32,
    vstart: u32,
    sew: Vsew,
    vlmax: u32,
    index: u64,
) where
    Reg: Register,
    [(); Reg::N]:,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    Memory: VirtualMemory,
    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
    CustomError: fmt::Debug,
{
    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
    let mask_buf = unsafe { snapshot_mask(state.ext_state.read_vreg(), vm, vl) };
    let vd_base = vd.bits();
    let vs2_base = vs2.bits();
    // Pre-compute the gathered value; it's the same for all elements.
    let val = if index < u64::from(vlmax) {
        // SAFETY: index < vlmax <= group_regs * elems_per_reg for vs2
        unsafe {
            read_element_u64(
                state.ext_state.read_vreg(),
                usize::from(vs2_base),
                index as u32,
                sew,
            )
        }
    } else {
        0u64
    };
    for i in vstart..vl {
        if !mask_bit(&mask_buf, i) {
            continue;
        }
        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
        unsafe {
            write_element_u64(state.ext_state.write_vreg(), vd_base, i, sew, val);
        }
    }
    state.ext_state.mark_vs_dirty();
    state.ext_state.reset_vstart();
}

/// Execute vrgatherei16.vv: `vd[i] = (vs1_16[i] < vlmax) ? vs2[vs1_16[i]] : 0`.
///
/// `vs1` always uses EEW=16 regardless of SEW. `vl` must not exceed the index register group
/// capacity, i.e. `vl <= index_group_regs * VLENB / 2` (VLENB/2 = elems per register at EEW=16).
///
/// # Safety
/// - `vd`, `vs2`, and `vs1` are validly aligned and mutually non-overlapping (verified by caller).
/// - `vl <= vlmax` (for the data register group) AND `vl <= index_group_regs * VLENB / 2` (for the
///   index register group).
/// - When `vm=false`: `vd.bits() != 0`.
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_rgatherei16<Reg, ExtState, Memory, PC, IH, CustomError>(
    state: &mut InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
    vd: VReg,
    vs2: VReg,
    vs1: VReg,
    vm: bool,
    vl: u32,
    vstart: u32,
    sew: Vsew,
    vlmax: u32,
    index_group_regs: u8,
) where
    Reg: Register,
    [(); Reg::N]:,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    Memory: VirtualMemory,
    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
    CustomError: fmt::Debug,
{
    // Maximum number of EEW=16 elements the index register group can hold.
    // Each register holds VLENB / 2 elements at EEW=16.
    let index_capacity = u32::from(index_group_regs) * (ExtState::VLENB / 2);
    // `vl` must not exceed either the data VLMAX or the index register group capacity.
    // Both bounds are guaranteed by the caller; this debug assertion catches misuse early.
    debug_assert!(
        vl <= vlmax && vl <= index_capacity,
        "vl={vl} exceeds vlmax={vlmax} or index_capacity={index_capacity}"
    );
    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
    let mask_buf = unsafe { snapshot_mask(state.ext_state.read_vreg(), vm, vl) };
    let vd_base = vd.bits();
    let vs2_base = vs2.bits();
    let vs1_base = vs1.bits();
    for i in vstart..vl {
        if !mask_bit(&mask_buf, i) {
            continue;
        }
        // Read 16-bit index from vs1; EEW=16 always.
        // SAFETY: i < vl <= index_capacity = index_group_regs * (VLENB/2), so element i
        // fits within the index register group.
        let index = unsafe {
            read_element_u64(
                state.ext_state.read_vreg(),
                usize::from(vs1_base),
                i,
                Vsew::E16,
            )
        };
        let val = if index < u64::from(vlmax) {
            // SAFETY: index < vlmax <= group_regs * elems_per_reg for vs2
            unsafe {
                read_element_u64(
                    state.ext_state.read_vreg(),
                    usize::from(vs2_base),
                    index as u32,
                    sew,
                )
            }
        } else {
            0u64
        };
        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
        unsafe {
            write_element_u64(state.ext_state.write_vreg(), vd_base, i, sew, val);
        }
    }
    state.ext_state.mark_vs_dirty();
    state.ext_state.reset_vstart();
}

/// Execute vmerge.vvm / vmv.v.v.
///
/// When `vm=true` (vmv.v.v): all active elements `vstart..vl` get `vs1[i]`; vs2 unused.
/// When `vm=false` (vmerge.vvm): active elements where `v0[i]=1` get `vs1[i]`,
/// inactive elements get `vs2[i]`.
///
/// # Safety
/// - `vd` and `vs1` are validly aligned (verified by caller).
/// - When `vm=false`: `vs2` is validly aligned and `vd` does not overlap v0 (verified by caller).
/// - `vl <= group_regs * VLENB / sew_bytes`.
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_merge_vv<Reg, ExtState, Memory, PC, IH, CustomError>(
    state: &mut InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
    vd: VReg,
    vs2: VReg,
    vs1: VReg,
    vm: bool,
    vl: u32,
    vstart: u32,
    sew: Vsew,
) where
    Reg: Register,
    [(); Reg::N]:,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    Memory: VirtualMemory,
    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
    CustomError: fmt::Debug,
{
    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`.
    // For vmv.v.v (vm=true) the mask is all-ones so snapshot_mask is still valid.
    let mask_buf = unsafe { snapshot_mask(state.ext_state.read_vreg(), vm, vl) };
    let vd_base = vd.bits();
    let vs1_base = vs1.bits();
    let vs2_base = vs2.bits();
    for i in vstart..vl {
        let mask_set = mask_bit(&mask_buf, i);
        let val = if mask_set {
            // SAFETY: i < vl <= group_regs * elems_per_reg for vs1
            unsafe { read_element_u64(state.ext_state.read_vreg(), usize::from(vs1_base), i, sew) }
        } else {
            // mask_set=false only reachable when vm=false (vmerge path).
            // SAFETY: i < vl <= group_regs * elems_per_reg for vs2
            unsafe { read_element_u64(state.ext_state.read_vreg(), usize::from(vs2_base), i, sew) }
        };
        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
        unsafe { write_element_u64(state.ext_state.write_vreg(), vd_base, i, sew, val) };
    }
    state.ext_state.mark_vs_dirty();
    state.ext_state.reset_vstart();
}

/// Execute vmerge.vxm / vmerge.vim / vmv.v.x / vmv.v.i.
///
/// When `vm=true`: all active elements `vstart..vl` get `scalar`; vs2 unused.
/// When `vm=false`: active elements where `v0[i]=1` get `scalar`,
/// inactive elements get `vs2[i]`.
///
/// # Safety
/// - `vd` is validly aligned (verified by caller).
/// - When `vm=false`: `vs2` is validly aligned and `vd` does not overlap v0 (verified by caller).
/// - `vl <= group_regs * VLENB / sew_bytes`.
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_merge_scalar<Reg, ExtState, Memory, PC, IH, CustomError>(
    state: &mut InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
    vd: VReg,
    vs2: VReg,
    vm: bool,
    vl: u32,
    vstart: u32,
    sew: Vsew,
    scalar: u64,
) where
    Reg: Register,
    [(); Reg::N]:,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    Memory: VirtualMemory,
    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
    CustomError: fmt::Debug,
{
    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`.
    let mask_buf = unsafe { snapshot_mask(state.ext_state.read_vreg(), vm, vl) };
    let vd_base = vd.bits();
    let vs2_base = vs2.bits();
    for i in vstart..vl {
        let val = if mask_bit(&mask_buf, i) {
            scalar
        } else {
            // SAFETY: i < vl <= group_regs * elems_per_reg for vs2
            unsafe { read_element_u64(state.ext_state.read_vreg(), usize::from(vs2_base), i, sew) }
        };
        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
        unsafe { write_element_u64(state.ext_state.write_vreg(), vd_base, i, sew, val) };
    }
    state.ext_state.mark_vs_dirty();
    state.ext_state.reset_vstart();
}
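
// A minimal sketch (hypothetical) of the vmerge element rule shared by the two helpers above:
// mask-set elements take the new value (vs1 element or scalar), mask-clear elements keep vs2.
#[cfg(test)]
mod merge_model_sketch {
    #[test]
    fn merge_selects_by_mask_bit() {
        let vs2 = [10u64, 11, 12, 13];
        let mask = [true, false, true, false];
        let scalar = 7u64;
        let mut vd = [0u64; 4];
        for ((d, &m), &s) in vd.iter_mut().zip(&mask).zip(&vs2) {
            *d = if m { scalar } else { s };
        }
        assert_eq!(vd, [7, 11, 7, 13]);
    }
}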

/// Execute vcompress.vm: pack active elements of vs2 (under vs1 mask) sequentially into vd.
///
/// `vs1` is treated as an explicit mask register (single register, not LMUL-grouped).
/// The output write index increments only for elements where `vs1[i]` is set.
/// vd must not overlap vs1 or vs2.
///
/// `vstart` semantics (spec §16.6): on resume, the number of elements already written to vd
/// equals the popcount of `vs1[0..vstart)`. Inputs at indices `0..vstart` are skipped; the
/// first write goes to the output slot equal to that prefix popcount.
///
/// # Safety
/// - `vd`, `vs2` are validly aligned and non-overlapping (verified by caller).
/// - `vs1` does not overlap `vd` (verified by caller).
/// - `vl <= VLMAX`.
#[inline(always)]
#[doc(hidden)]
pub unsafe fn execute_compress<Reg, ExtState, Memory, PC, IH, CustomError>(
    state: &mut InterpreterState<Reg, ExtState, Memory, PC, IH, CustomError>,
    vd: VReg,
    vs2: VReg,
    vs1: VReg,
    vl: u32,
    vstart: u32,
    sew: Vsew,
) where
    Reg: Register,
    [(); Reg::N]:,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    Memory: VirtualMemory,
    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
    CustomError: fmt::Debug,
{
    let vd_base = vd.bits();
    let vs2_base = vs2.bits();
    let vs1_base = vs1.bits();
    // Snapshot the entire vs1 mask register before any writes (vd != vs1 is verified by caller).
    let mask_bytes = vl.div_ceil(u8::BITS) as usize;
    let vreg = state.ext_state.read_vreg();
    let mut vs1_buf = [0u8; { ExtState::VLENB as usize }];
    // SAFETY: `mask_bytes <= VLENB` since `vl <= VLEN`; `vs1_base < 32`
    unsafe {
        vs1_buf.get_unchecked_mut(..mask_bytes).copy_from_slice(
            vreg.get_unchecked(usize::from(vs1_base))
                .get_unchecked(..mask_bytes),
        );
    }
    // When vstart >= vl there are no source elements to process: skip directly to
    // the end-of-instruction bookkeeping. This also ensures the prefix scan below
    // never iterates past vl.
    if vstart >= vl {
        state.ext_state.mark_vs_dirty();
        state.ext_state.reset_vstart();
        return;
    }
    // Per spec §16.6: resuming from `vstart` means source elements `0..vstart` were already
    // processed. The first write slot is therefore the number of set bits in `vs1[0..vstart)`,
    // not `vstart` itself (which would be wrong when the mask is sparse before `vstart`).
    // The early return above guarantees vstart < vl, so the scan stays within `mask_bytes`.
    let mut out_idx = (0..vstart).filter(|&j| mask_bit(&vs1_buf, j)).count() as u32;
    for i in vstart..vl {
        if !mask_bit(&vs1_buf, i) {
            continue;
        }
        // SAFETY: i < vl <= group_regs * elems_per_reg for vs2
        let val =
            unsafe { read_element_u64(state.ext_state.read_vreg(), usize::from(vs2_base), i, sew) };
        // SAFETY: out_idx <= popcount(vs1[0..vl)) <= vl <= group_regs * elems_per_reg for vd
        unsafe {
            write_element_u64(state.ext_state.write_vreg(), vd_base, out_idx, sew, val);
        }
        out_idx += 1;
    }
    state.ext_state.mark_vs_dirty();
    state.ext_state.reset_vstart();
}
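
// A minimal sketch (hypothetical) of the vstart-resume rule above: the first output slot is the
// popcount of the mask below vstart, not vstart itself.
#[cfg(test)]
mod compress_resume_sketch {
    use super::*;

    #[test]
    fn resume_slot_is_the_prefix_popcount() {
        let vs2 = [10u64, 11, 12, 13, 14, 15];
        let mask = [true, false, false, true, true, false];
        let (vstart, vl) = (3usize, 6usize);
        // Only mask[0] is set below vstart, so exactly one output slot is already filled.
        let mut out_idx = mask[..vstart].iter().filter(|&&m| m).count();
        assert_eq!(out_idx, 1);
        let mut vd = [u64::MAX; 6]; // sentinel so untouched slots are visible
        for i in vstart..vl {
            if mask[i] {
                vd[out_idx] = vs2[i];
                out_idx += 1;
            }
        }
        assert_eq!(&vd[1..out_idx], &[13, 14]);
    }
}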

/// Copy `count` whole vector registers from `src_base` to `dst_base`.
///
/// No masking, no vtype dependency. Uses snapshot semantics: all source registers are read into
/// a stack buffer before any destination registers are written, giving correct memmove-style
/// behaviour for all overlap patterns. Under the alignment preconditions below, equal-size
/// groups either coincide exactly or are disjoint, but the snapshot remains correct even for
/// hypothetical partial overlap (e.g., src=V0, dst=V1, count=2).
///
/// The stack allocation is at most 8 × VLENB bytes (`count <= 8` for vmv1r–vmv8r).
///
/// # Safety
/// - `dst_base + count <= 32` and `src_base + count <= 32` (verified by caller via alignment
///   checks).
/// - `dst_base % count == 0` and `src_base % count == 0` (verified by caller).
#[inline(always)]
#[doc(hidden)]
pub unsafe fn execute_whole_reg_move<const VLENB: usize>(
    vreg: &mut [[u8; VLENB]; 32],
    dst_base: u8,
    src_base: u8,
    count: u8,
) {
    let count = usize::from(count);
    debug_assert!(count <= 8, "count must be <= 8 for vmvNr");
    // Snapshot all source registers before writing any destination registers.
    // This is correct for all overlap patterns without direction-dependent logic.
    let mut tmp = [[0u8; VLENB]; 8];
    for (k, item) in tmp.iter_mut().enumerate().take(count) {
        let src_idx = usize::from(src_base) + k;
        // SAFETY: src_idx < 32 per caller guarantee
        *item = *unsafe { vreg.get_unchecked(src_idx) };
    }
    for (k, item) in tmp.iter().enumerate().take(count) {
        let dst_idx = usize::from(dst_base) + k;
        // SAFETY: dst_idx < 32 per caller guarantee
        *unsafe { vreg.get_unchecked_mut(dst_idx) } = *item;
    }
}
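
// A minimal sketch (hypothetical, re-deriving the snapshot loop on a tiny register file) of why
// snapshot-before-write behaves like memmove: a naive ascending copy of v0..v2 into v1..v3
// would re-read the freshly written v1 and duplicate v0.
#[cfg(test)]
mod whole_reg_move_sketch {
    #[test]
    fn snapshot_copy_survives_partial_overlap() {
        let mut vreg = [[0u8; 4]; 8];
        vreg[0] = [0xA0; 4];
        vreg[1] = [0xA1; 4];
        let (src, dst, count) = (0usize, 1usize, 2usize);
        // Read every source register before writing any destination register.
        let mut tmp = [[0u8; 4]; 2];
        for (k, item) in tmp.iter_mut().enumerate().take(count) {
            *item = vreg[src + k];
        }
        for (k, item) in tmp.iter().enumerate().take(count) {
            vreg[dst + k] = *item;
        }
        assert_eq!(vreg[1], [0xA0; 4]);
        assert_eq!(vreg[2], [0xA1; 4]);
    }
}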