ab_riscv_interpreter/v/zve64x/perm/zve64x_perm_helpers.rs

//! Opaque permutation helpers for the Zve64x extension

use crate::v::vector_registers::VectorRegistersExt;
pub use crate::v::zve64x::arith::zve64x_arith_helpers::check_vreg_group_alignment;
use crate::v::zve64x::arith::zve64x_arith_helpers::{read_element_u64, write_element_u64};
use crate::v::zve64x::load::zve64x_load_helpers::{mask_bit, snapshot_mask};
use crate::v::zve64x::zve64x_helpers::INSTRUCTION_SIZE;
use crate::{ExecutionError, ProgramCounter};
use ab_riscv_primitives::prelude::*;
use core::fmt;

/// Check that register groups `[a, a+count)` and `[b, b+count)` do not overlap.
///
/// Both groups must have the same size `count`. For groups of different sizes use
/// [`check_no_overlap_asymmetric`].
#[inline(always)]
#[doc(hidden)]
pub fn check_no_overlap<Reg, Memory, PC, CustomError>(
    program_counter: &PC,
    a: VReg,
    b: VReg,
    count: u8,
) -> Result<(), ExecutionError<Reg::Type, CustomError>>
where
    Reg: Register,
    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
{
    let a_start = u16::from(a.bits());
    let b_start = u16::from(b.bits());
    let count = u16::from(count);
    // Intervals [a_start, a_start+count) and [b_start, b_start+count) overlap iff
    // each starts before the other ends. Arithmetic is widened to u16 so that
    // `start + count` can never wrap for any u8 inputs (e.g., a hypothetical
    // b_start=250 + count=8 = 258 would overflow u8), keeping the predicate total
    // without range assumptions.
    if a_start < b_start + count && b_start < a_start + count {
        return Err(ExecutionError::IllegalInstruction {
            address: program_counter.old_pc(INSTRUCTION_SIZE),
        });
    }
    Ok(())
}
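
// Worked example (illustrative note, not in the original source): for a = v8,
// b = v12, count = 8 the groups are [8, 16) and [12, 20); since 8 < 12 + 8 and
// 12 < 8 + 8, both conditions hold and the call rejects the encoding as an
// illegal instruction. For b = v16 the second condition fails (16 < 16 is
// false), so the groups [8, 16) and [16, 24) are accepted as disjoint.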

/// Check that register group `[a, a+a_count)` does not overlap `[b, b+b_count)`.
///
/// Unlike [`check_no_overlap`], the two groups are allowed to have different sizes.
/// Used for `vrgatherei16.vv` where vd/vs2 use LMUL-derived `group_regs` and vs1
/// uses EEW=16-derived `index_group_regs`.
#[inline(always)]
#[doc(hidden)]
pub fn check_no_overlap_asymmetric<Reg, Memory, PC, CustomError>(
    program_counter: &PC,
    a: VReg,
    a_count: u8,
    b: VReg,
    b_count: u8,
) -> Result<(), ExecutionError<Reg::Type, CustomError>>
where
    Reg: Register,
    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
{
    let a_start = u16::from(a.bits());
    let b_start = u16::from(b.bits());
    let a_count = u16::from(a_count);
    let b_count = u16::from(b_count);
    // Intervals [a_start, a_start+a_count) and [b_start, b_start+b_count) overlap iff
    // each starts before the other ends.
    if a_start < b_start + b_count && b_start < a_start + a_count {
        return Err(ExecutionError::IllegalInstruction {
            address: program_counter.old_pc(INSTRUCTION_SIZE),
        });
    }
    Ok(())
}

/// Read element 0 of register `base_reg` as `u64`, zero-extended.
///
/// # Safety
/// `base_reg < 32` and `sew.bytes() <= VLENB` must hold.
#[inline(always)]
pub unsafe fn read_element_0_u64<const VLENB: usize>(
    vreg: &[[u8; VLENB]; 32],
    base_reg: u8,
    sew: Vsew,
) -> u64 {
    let sew_bytes = usize::from(sew.bytes());
    // SAFETY: `base_reg < 32` by VReg invariant
    let reg = unsafe { vreg.get_unchecked(usize::from(base_reg)) };
    let mut buf = [0u8; 8];
    // SAFETY: `sew_bytes <= VLENB` for all legal vtype; `sew_bytes <= 8`
    unsafe { buf.get_unchecked_mut(..sew_bytes) }
        .copy_from_slice(unsafe { reg.get_unchecked(..sew_bytes) });
    u64::from_le_bytes(buf)
}

/// Write element 0 of register `base_reg` from the low `sew.bytes()` bytes of `value`.
///
/// # Safety
/// `base_reg < 32` and `sew.bytes() <= VLENB` must hold.
#[inline(always)]
pub unsafe fn write_element_0_u64<const VLENB: usize>(
    vreg: &mut [[u8; VLENB]; 32],
    base_reg: u8,
    sew: Vsew,
    value: u64,
) {
    let sew_bytes = usize::from(sew.bytes());
    let buf = value.to_le_bytes();
    // SAFETY: `base_reg < 32` by VReg invariant
    let reg = unsafe { vreg.get_unchecked_mut(usize::from(base_reg)) };
    // SAFETY: `sew_bytes <= VLENB`; `sew_bytes <= 8`
    unsafe { reg.get_unchecked_mut(..sew_bytes) }
        .copy_from_slice(unsafe { buf.get_unchecked(..sew_bytes) });
}

/// Sign-extend the low `sew.bits()` bits of `val` to the register type width.
///
/// The arithmetic is performed entirely in 64-bit signed integer space: we shift the SEW-wide
/// value left to place its sign bit at bit 63, then arithmetic-right-shift back to propagate it.
/// The resulting `u64` is then narrowed to `Reg::Type` (32 or 64 bits) via `From<u32>`, the only
/// integer conversion in the `Register::Type` trait bounds.
///
/// For RV32 (`Reg::XLEN == 32`) the low 32 bits are already the correct sign-extended result
/// because the arithmetic shift propagates the sign across all 64 bits and then we discard the
/// upper half.
///
/// For RV64 (`Reg::XLEN == 64`) we must preserve all 64 bits. Since `Reg::Type: From<u32>` and
/// `Reg::Type: Shl<u8>`, we reconstruct the 64-bit value by OR-ing two 32-bit halves shifted
/// into position.
#[inline(always)]
pub fn sign_extend_to_reg<Reg>(val: u64, sew: Vsew) -> Reg::Type
where
    Reg: Register,
{
    let sew_bits = u32::from(sew.bits());
    // `shift` is in [0, 64). When sew_bits == 64, shift == 0 and the value is unchanged.
    let shift = u64::BITS - sew_bits;
    // Cast to i64 so the right-shift is arithmetic (sign-extending).
    let sign_extended = (val.cast_signed() << shift) >> shift;
    let raw = sign_extended.cast_unsigned();
    if Reg::XLEN == u64::BITS as u8 {
        // RV64: preserve all 64 bits by splitting into two u32 halves.
        let lo = Reg::Type::from(raw as u32);
        let hi = Reg::Type::from((raw >> u32::BITS) as u32);
        lo | (hi << 32u8)
    } else {
        // RV32: the low 32 bits are the correctly truncated result.
        Reg::Type::from(raw as u32)
    }
}
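
// Worked example (illustrative note, not in the original source): for SEW=8
// and val = 0x80, shift = 64 - 8 = 56; (0x80 as i64) << 56 puts the sign bit
// at bit 63, and the arithmetic shift back yields 0xFFFF_FFFF_FFFF_FF80. On
// RV64 the halves 0xFFFF_FF80 (lo) and 0xFFFF_FFFF (hi) are recombined into
// the full 64-bit result; on RV32 only the low half 0xFFFF_FF80 is kept.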

/// Execute a vslideup operation.
///
/// Elements `vstart..min(offset, vl)` in vd are unchanged.
/// Elements `max(vstart, offset)..vl` where mask is active get vs2[i - offset].
///
/// # Safety
/// - `vd` and `vs2` are validly aligned and non-overlapping (verified by caller).
/// - `vl <= group_regs * VLENB / sew_bytes`.
/// - When `vm=false`: `vd.bits() != 0`.
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_slideup<Reg, ExtState, CustomError>(
    ext_state: &mut ExtState,
    vd: VReg,
    vs2: VReg,
    vm: bool,
    vl: u32,
    vstart: u32,
    sew: Vsew,
    offset: u64,
) where
    Reg: Register,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    CustomError: fmt::Debug,
{
    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
    let mask_buf = unsafe { snapshot_mask(ext_state.read_vreg(), vm, vl) };
    let vd_base = vd.bits();
    let vs2_base = vs2.bits();
    // Per spec §16.3.1: elements 0..offset are never written (vd keeps its value).
    // The active range starts at max(vstart, offset).
    let start = vstart.max(offset.min(u64::from(u32::MAX)) as u32);
    for i in start..vl {
        if !mask_bit(&mask_buf, i) {
            continue;
        }
        let src_idx = i as u64 - offset;
        // SAFETY: src_idx < vl <= group_regs * elems_per_reg, so source element is in range
        let val = unsafe {
            read_element_u64(
                ext_state.read_vreg(),
                usize::from(vs2_base),
                src_idx as u32,
                sew,
            )
        };
        // SAFETY: i < vl <= group_regs * elems_per_reg, so dest element is in range
        unsafe {
            write_element_u64(ext_state.write_vreg(), vd_base, i, sew, val);
        }
    }
    ext_state.mark_vs_dirty();
    ext_state.reset_vstart();
}
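
// Illustrative semantics (not in the original source): with vl = 4, offset = 2
// and vs2 = [a, b, c, d], an unmasked vslideup leaves vd[0] and vd[1]
// untouched and writes vd[2] = vs2[0] = a, vd[3] = vs2[1] = b.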

/// Execute a vslidedown operation.
///
/// Element `vd[i] = vs2[i + offset]` if `i + offset < vlmax`, else `0`.
///
/// # Safety
/// - `vd` and `vs2` are validly aligned (verified by caller); overlap is permitted.
/// - `vl <= vlmax`.
/// - When `vm=false`: `vd.bits() != 0`.
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_slidedown<Reg, ExtState, CustomError>(
    ext_state: &mut ExtState,
    vd: VReg,
    vs2: VReg,
    vm: bool,
    vl: u32,
    vstart: u32,
    sew: Vsew,
    vlmax: u32,
    offset: u64,
) where
    Reg: Register,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    CustomError: fmt::Debug,
{
    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
    let mask_buf = unsafe { snapshot_mask(ext_state.read_vreg(), vm, vl) };
    let vd_base = vd.bits();
    let vs2_base = vs2.bits();
    for i in vstart..vl {
        if !mask_bit(&mask_buf, i) {
            continue;
        }
        // Use checked_add to guard against offset being so large that i + offset overflows u64.
        // Any value that wraps past u64::MAX is trivially >= vlmax, so the spec requires vd[i]=0.
        let val = if let Some(src_idx) = u64::from(i).checked_add(offset)
            && src_idx < u64::from(vlmax)
        {
            // SAFETY: src_idx < vlmax <= group_regs * elems_per_reg, so element is in range
            unsafe {
                read_element_u64(
                    ext_state.read_vreg(),
                    usize::from(vs2_base),
                    src_idx as u32,
                    sew,
                )
            }
        } else {
            0
        };
        // SAFETY: i < vl <= vlmax <= group_regs * elems_per_reg
        unsafe {
            write_element_u64(ext_state.write_vreg(), vd_base, i, sew, val);
        }
    }
    ext_state.mark_vs_dirty();
    ext_state.reset_vstart();
}
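
// Illustrative semantics (not in the original source): with vl = vlmax = 4,
// offset = 2 and vs2 = [a, b, c, d], an unmasked vslidedown writes
// vd = [c, d, 0, 0]; source indices 4 and 5 are >= vlmax, so those elements
// become zero.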

/// Execute a vslide1up operation.
///
/// Element 0 of vd gets `scalar` (when active and vl > 0).
/// Element `i` for `1 <= i < vl` gets `vs2[i - 1]`.
/// vd must not overlap vs2.
///
/// # Safety
/// - `vd` and `vs2` are validly aligned and non-overlapping (verified by caller).
/// - `vl <= group_regs * VLENB / sew_bytes`.
/// - When `vm=false`: `vd.bits() != 0`.
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_slide1up<Reg, ExtState, CustomError>(
    ext_state: &mut ExtState,
    vd: VReg,
    vs2: VReg,
    vm: bool,
    vl: u32,
    vstart: u32,
    sew: Vsew,
    scalar: u64,
) where
    Reg: Register,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    CustomError: fmt::Debug,
{
    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
    let mask_buf = unsafe { snapshot_mask(ext_state.read_vreg(), vm, vl) };
    let vd_base = vd.bits();
    let vs2_base = vs2.bits();
    for i in vstart..vl {
        if !mask_bit(&mask_buf, i) {
            continue;
        }
        let val = if i == 0 {
            scalar
        } else {
            // SAFETY: i - 1 < vl <= group_regs * elems_per_reg
            unsafe { read_element_u64(ext_state.read_vreg(), usize::from(vs2_base), i - 1, sew) }
        };
        // SAFETY: i < vl <= group_regs * elems_per_reg
        unsafe {
            write_element_u64(ext_state.write_vreg(), vd_base, i, sew, val);
        }
    }
    ext_state.mark_vs_dirty();
    ext_state.reset_vstart();
}

/// Execute a vslide1down operation.
///
/// Element `vd[i] = vs2[i + 1]` for `i < vl - 1`; element `vd[vl - 1]` gets `scalar`.
///
/// Overlap between `vd` and `vs2` is permitted by the spec. When they share the same register
/// group base (exact overlap), ascending iteration is still correct: each write goes to byte range
/// `[i*sew, (i+1)*sew)` while the subsequent read comes from `[(i+1)*sew, (i+2)*sew)`. These
/// ranges are adjacent and non-overlapping, so writing element `i` never corrupts the source bytes
/// of element `i+1`.
///
/// # Safety
/// - `vd` and `vs2` are validly aligned (verified by caller); overlap is permitted.
/// - `vl <= group_regs * VLENB / sew_bytes`.
/// - When `vm=false`: `vd.bits() != 0`.
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_slide1down<Reg, ExtState, CustomError>(
    ext_state: &mut ExtState,
    vd: VReg,
    vs2: VReg,
    vm: bool,
    vl: u32,
    vstart: u32,
    sew: Vsew,
    scalar: u64,
) where
    Reg: Register,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    CustomError: fmt::Debug,
{
    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
    let mask_buf = unsafe { snapshot_mask(ext_state.read_vreg(), vm, vl) };
    let vd_base = vd.bits();
    let vs2_base = vs2.bits();
    for i in vstart..vl {
        if !mask_bit(&mask_buf, i) {
            continue;
        }
        let val = if i + 1 < vl {
            // SAFETY: i + 1 < vl <= group_regs * elems_per_reg
            unsafe { read_element_u64(ext_state.read_vreg(), usize::from(vs2_base), i + 1, sew) }
        } else {
            scalar
        };
        // SAFETY: i < vl <= group_regs * elems_per_reg
        unsafe {
            write_element_u64(ext_state.write_vreg(), vd_base, i, sew, val);
        }
    }
    ext_state.mark_vs_dirty();
    ext_state.reset_vstart();
}
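
// Illustrative semantics (not in the original source): with vl = 4,
// vs2 = [a, b, c, d] and scalar = s, an unmasked vslide1down writes
// vd = [b, c, d, s]. With vd == vs2 the ascending loop is still safe: the
// write to element i covers bytes [i*sew, (i+1)*sew) while the next read
// covers [(i+1)*sew, (i+2)*sew), which never intersect.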

/// Execute vrgather.vv: `vd[i] = (vs1[i] < vlmax) ? vs2[vs1[i]] : 0`.
///
/// # Safety
/// - `vd`, `vs2`, and `vs1` are validly aligned and mutually non-overlapping (verified by caller).
/// - `vl <= vlmax`.
/// - When `vm=false`: `vd.bits() != 0`.
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_rgather_vv<Reg, ExtState, CustomError>(
    ext_state: &mut ExtState,
    vd: VReg,
    vs2: VReg,
    vs1: VReg,
    vm: bool,
    vl: u32,
    vstart: u32,
    sew: Vsew,
    vlmax: u32,
) where
    Reg: Register,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    CustomError: fmt::Debug,
{
    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
    let mask_buf = unsafe { snapshot_mask(ext_state.read_vreg(), vm, vl) };
    let vd_base = vd.bits();
    let vs2_base = vs2.bits();
    let vs1_base = vs1.bits();
    for i in vstart..vl {
        if !mask_bit(&mask_buf, i) {
            continue;
        }
        // SAFETY: i < vl <= group_regs * elems_per_reg for vs1
        let index =
            unsafe { read_element_u64(ext_state.read_vreg(), usize::from(vs1_base), i, sew) };
        let val = if index < u64::from(vlmax) {
            // SAFETY: index < vlmax <= group_regs * elems_per_reg for vs2
            unsafe {
                read_element_u64(
                    ext_state.read_vreg(),
                    usize::from(vs2_base),
                    index as u32,
                    sew,
                )
            }
        } else {
            0u64
        };
        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
        unsafe {
            write_element_u64(ext_state.write_vreg(), vd_base, i, sew, val);
        }
    }
    ext_state.mark_vs_dirty();
    ext_state.reset_vstart();
}
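
// Illustrative semantics (not in the original source): with vl = vlmax = 4,
// vs2 = [a, b, c, d] and indices vs1 = [3, 3, 0, 7], an unmasked vrgather.vv
// writes vd = [d, d, a, 0]; index 7 is >= vlmax, so that element becomes zero
// instead of reading out of range.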

/// Execute vrgather.vx / vrgather.vi: all active elements get `vs2[index]` or `0`.
///
/// # Safety
/// - `vd` and `vs2` are validly aligned and non-overlapping (verified by caller).
/// - `vl <= vlmax`.
/// - When `vm=false`: `vd.bits() != 0`.
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_rgather_scalar<Reg, ExtState, CustomError>(
    ext_state: &mut ExtState,
    vd: VReg,
    vs2: VReg,
    vm: bool,
    vl: u32,
    vstart: u32,
    sew: Vsew,
    vlmax: u32,
    index: u64,
) where
    Reg: Register,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    CustomError: fmt::Debug,
{
    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
    let mask_buf = unsafe { snapshot_mask(ext_state.read_vreg(), vm, vl) };
    let vd_base = vd.bits();
    let vs2_base = vs2.bits();
    // Pre-compute the gathered value; it's the same for all elements.
    let val = if index < u64::from(vlmax) {
        // SAFETY: index < vlmax <= group_regs * elems_per_reg for vs2
        unsafe {
            read_element_u64(
                ext_state.read_vreg(),
                usize::from(vs2_base),
                index as u32,
                sew,
            )
        }
    } else {
        0u64
    };
    for i in vstart..vl {
        if !mask_bit(&mask_buf, i) {
            continue;
        }
        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
        unsafe {
            write_element_u64(ext_state.write_vreg(), vd_base, i, sew, val);
        }
    }
    ext_state.mark_vs_dirty();
    ext_state.reset_vstart();
}

/// Execute vrgatherei16.vv: `vd[i] = (vs1_16[i] < vlmax) ? vs2[vs1_16[i]] : 0`.
///
/// `vs1` always uses EEW=16 regardless of SEW. `vl` must not exceed the index register group
/// capacity, i.e. `vl <= index_group_regs * VLENB / 2` (VLENB/2 = elems per register at EEW=16).
///
/// # Safety
/// - `vd`, `vs2`, and `vs1` are validly aligned and mutually non-overlapping (verified by caller).
/// - `vl <= vlmax` (for the data register group) AND `vl <= index_group_regs * VLENB / 2` (for the
///   index register group).
/// - When `vm=false`: `vd.bits() != 0`.
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_rgatherei16<Reg, ExtState, CustomError>(
    ext_state: &mut ExtState,
    vd: VReg,
    vs2: VReg,
    vs1: VReg,
    vm: bool,
    vl: u32,
    vstart: u32,
    sew: Vsew,
    vlmax: u32,
    index_group_regs: u8,
) where
    Reg: Register,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    CustomError: fmt::Debug,
{
    // Maximum number of EEW=16 elements the index register group can hold.
    // Each register holds VLENB / 2 elements at EEW=16.
    let index_capacity = u32::from(index_group_regs) * (ExtState::VLENB / 2);
    // `vl` must not exceed either the data VLMAX or the index register group capacity.
    // Both bounds are guaranteed by the caller; this debug assertion catches misuse early.
    debug_assert!(
        vl <= vlmax && vl <= index_capacity,
        "vl={vl} exceeds vlmax={vlmax} or index_capacity={index_capacity}"
    );
    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
    let mask_buf = unsafe { snapshot_mask(ext_state.read_vreg(), vm, vl) };
    let vd_base = vd.bits();
    let vs2_base = vs2.bits();
    let vs1_base = vs1.bits();
    for i in vstart..vl {
        if !mask_bit(&mask_buf, i) {
            continue;
        }
        // Read 16-bit index from vs1; EEW=16 always.
        // SAFETY: i < vl <= index_capacity = index_group_regs * (VLENB/2), so element i
        // fits within the index register group.
        let index =
            unsafe { read_element_u64(ext_state.read_vreg(), usize::from(vs1_base), i, Vsew::E16) };
        let val = if index < u64::from(vlmax) {
            // SAFETY: index < vlmax <= group_regs * elems_per_reg for vs2
            unsafe {
                read_element_u64(
                    ext_state.read_vreg(),
                    usize::from(vs2_base),
                    index as u32,
                    sew,
                )
            }
        } else {
            0u64
        };
        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
        unsafe {
            write_element_u64(ext_state.write_vreg(), vd_base, i, sew, val);
        }
    }
    ext_state.mark_vs_dirty();
    ext_state.reset_vstart();
}
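
// Illustrative capacity arithmetic (not in the original source): with
// VLEN = 128 (VLENB = 16), each register holds 16 / 2 = 8 EEW=16 indices, so
// index_capacity = index_group_regs * 8; the debug assertion then rejects any
// vl exceeding either this capacity or the data-side vlmax.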

/// Execute vmerge.vvm / vmv.v.v.
///
/// When `vm=true` (vmv.v.v): all active elements `vstart..vl` get `vs1[i]`; vs2 unused.
/// When `vm=false` (vmerge.vvm): active elements where `v0[i]=1` get `vs1[i]`,
/// inactive elements get `vs2[i]`.
///
/// # Safety
/// - `vd` and `vs1` are validly aligned (verified by caller).
/// - When `vm=false`: `vs2` is validly aligned and `vd` does not overlap v0 (verified by caller).
/// - `vl <= group_regs * VLENB / sew_bytes`.
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_merge_vv<Reg, ExtState, CustomError>(
    ext_state: &mut ExtState,
    vd: VReg,
    vs2: VReg,
    vs1: VReg,
    vm: bool,
    vl: u32,
    vstart: u32,
    sew: Vsew,
) where
    Reg: Register,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    CustomError: fmt::Debug,
{
    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`.
    // For vmv.v.v (vm=true) the mask is all-ones so snapshot_mask is still valid.
    let mask_buf = unsafe { snapshot_mask(ext_state.read_vreg(), vm, vl) };
    let vd_base = vd.bits();
    let vs1_base = vs1.bits();
    let vs2_base = vs2.bits();
    for i in vstart..vl {
        let mask_set = mask_bit(&mask_buf, i);
        let val = if mask_set {
            // SAFETY: i < vl <= group_regs * elems_per_reg for vs1
            unsafe { read_element_u64(ext_state.read_vreg(), usize::from(vs1_base), i, sew) }
        } else {
            // mask_set=false only reachable when vm=false (vmerge path).
            // SAFETY: i < vl <= group_regs * elems_per_reg for vs2
            unsafe { read_element_u64(ext_state.read_vreg(), usize::from(vs2_base), i, sew) }
        };
        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
        unsafe { write_element_u64(ext_state.write_vreg(), vd_base, i, sew, val) };
    }
    ext_state.mark_vs_dirty();
    ext_state.reset_vstart();
}
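
// Illustrative semantics (not in the original source): with vl = 4,
// vs1 = [a, b, c, d], vs2 = [w, x, y, z] and v0 mask bits 0b0101, vmerge.vvm
// writes vd = [a, x, c, z] (set bits select vs1, clear bits select vs2). With
// vm = true (vmv.v.v) the snapshot mask is all-ones and vd = [a, b, c, d].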

/// Execute vmerge.vxm / vmerge.vim / vmv.v.x / vmv.v.i.
///
/// When `vm=true`: all active elements `vstart..vl` get `scalar`; vs2 unused.
/// When `vm=false`: active elements where `v0[i]=1` get `scalar`,
/// inactive elements get `vs2[i]`.
///
/// # Safety
/// - `vd` is validly aligned (verified by caller).
/// - When `vm=false`: `vs2` is validly aligned and `vd` does not overlap v0 (verified by caller).
/// - `vl <= group_regs * VLENB / sew_bytes`.
#[inline(always)]
#[expect(clippy::too_many_arguments, reason = "Internal API")]
#[doc(hidden)]
pub unsafe fn execute_merge_scalar<Reg, ExtState, CustomError>(
    ext_state: &mut ExtState,
    vd: VReg,
    vs2: VReg,
    vm: bool,
    vl: u32,
    vstart: u32,
    sew: Vsew,
    scalar: u64,
) where
    Reg: Register,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    CustomError: fmt::Debug,
{
    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`.
    let mask_buf = unsafe { snapshot_mask(ext_state.read_vreg(), vm, vl) };
    let vd_base = vd.bits();
    let vs2_base = vs2.bits();
    for i in vstart..vl {
        let val = if mask_bit(&mask_buf, i) {
            scalar
        } else {
            // SAFETY: i < vl <= group_regs * elems_per_reg for vs2
            unsafe { read_element_u64(ext_state.read_vreg(), usize::from(vs2_base), i, sew) }
        };
        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
        unsafe { write_element_u64(ext_state.write_vreg(), vd_base, i, sew, val) };
    }
    ext_state.mark_vs_dirty();
    ext_state.reset_vstart();
}

/// Execute vcompress.vm: pack active elements of vs2 (under vs1 mask) sequentially into vd.
///
/// `vs1` is treated as an explicit mask register (single register, not LMUL-grouped).
/// The output write index increments only for elements where `vs1[i]` is set.
/// vd must not overlap vs1 or vs2.
///
/// # Safety
/// - `vd`, `vs2` are validly aligned and non-overlapping (verified by caller).
/// - `vs1` does not overlap `vd` (verified by caller).
/// - `vl <= VLMAX`.
#[inline(always)]
#[doc(hidden)]
pub unsafe fn execute_compress<Reg, ExtState, CustomError>(
    ext_state: &mut ExtState,
    vd: VReg,
    vs2: VReg,
    vs1: VReg,
    vl: u32,
    sew: Vsew,
) where
    Reg: Register,
    ExtState: VectorRegistersExt<Reg, CustomError>,
    [(); ExtState::ELEN as usize]:,
    [(); ExtState::VLEN as usize]:,
    [(); ExtState::VLENB as usize]:,
    CustomError: fmt::Debug,
{
    let vd_base = vd.bits();
    let vs2_base = vs2.bits();
    let vs1_base = vs1.bits();
    let mask_bytes = vl.div_ceil(u8::BITS) as usize;
    let vreg = ext_state.read_vreg();
    let mut vs1_buf = [0u8; { ExtState::VLENB as usize }];
    // SAFETY: mask_bytes <= VLENB since vl <= VLEN; vs1_base < 32
    unsafe {
        vs1_buf.get_unchecked_mut(..mask_bytes).copy_from_slice(
            vreg.get_unchecked(usize::from(vs1_base))
                .get_unchecked(..mask_bytes),
        );
    }
    let mut out_idx = 0u32;
    for i in 0..vl {
        if !mask_bit(&vs1_buf, i) {
            continue;
        }
        // SAFETY: i < vl <= group_regs * elems_per_reg
        let val = unsafe { read_element_u64(ext_state.read_vreg(), usize::from(vs2_base), i, sew) };
        // SAFETY: out_idx <= popcount(vs1[0..vl)) <= vl
        unsafe {
            write_element_u64(ext_state.write_vreg(), vd_base, out_idx, sew, val);
        }
        out_idx += 1;
    }
    ext_state.mark_vs_dirty();
    ext_state.reset_vstart();
}
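
// Illustrative semantics (not in the original source): with vl = 4,
// vs2 = [a, b, c, d] and mask bits vs1 = 0b1010, vcompress.vm packs the
// active elements b (i = 1) and d (i = 3) into vd[0] and vd[1]; elements of
// vd at index >= 2 are left untouched by this helper.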

/// Copy `count` whole vector registers from `src_base` to `dst_base`.
///
/// No masking, no vtype dependency. Uses snapshot semantics: all source registers are read into
/// a stack buffer before any destination registers are written, giving correct memmove-style
/// behaviour for all overlap patterns. Under the alignment requirements below, two groups of the
/// same `count` either coincide exactly or are disjoint, but the snapshot stays correct even for
/// partially overlapping inputs (such as a hypothetical src=V0, dst=V1, count=2) without any
/// direction-dependent logic.
///
/// The stack allocation is at most 8 × VLENB bytes (`count <= 8` for vmv1r–vmv8r).
///
/// # Safety
/// - `dst_base + count <= 32` and `src_base + count <= 32` (verified by caller via alignment
///   checks).
/// - `dst_base % count == 0` and `src_base % count == 0` (verified by caller).
#[inline(always)]
#[doc(hidden)]
pub unsafe fn execute_whole_reg_move<const VLENB: usize>(
    vreg: &mut [[u8; VLENB]; 32],
    dst_base: u8,
    src_base: u8,
    count: u8,
) {
    let count = usize::from(count);
    debug_assert!(count <= 8, "count must be <= 8 for vmvNr");
    // Snapshot all source registers before writing any destination registers.
    // This is correct for all overlap patterns without direction-dependent logic.
    let mut tmp = [[0u8; VLENB]; 8];
    for (k, item) in tmp.iter_mut().enumerate().take(count) {
        let src_idx = usize::from(src_base) + k;
        // SAFETY: src_idx < 32 per caller guarantee
        *item = *unsafe { vreg.get_unchecked(src_idx) };
    }
    for (k, item) in tmp.iter().enumerate().take(count) {
        let dst_idx = usize::from(dst_base) + k;
        // SAFETY: dst_idx < 32 per caller guarantee
        *unsafe { vreg.get_unchecked_mut(dst_idx) } = *item;
    }
}
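
// Illustrative call (not in the original source): with count = 2, src_base = 2
// and dst_base = 0, the first loop snapshots [v2, v3] and the second writes
// them to [v0, v1]. Because the snapshot completes before any write, the copy
// is memmove-correct regardless of how the two groups relate.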