Skip to main content

ab_riscv_interpreter/v/zvexx/perm/
zvexx_perm_helpers.rs

1//! Opaque helpers for ZveXx extension
2
3use crate::v::vector_registers::{VectorRegisterFile, VectorRegistersExt};
4pub use crate::v::zvexx::arith::zvexx_arith_helpers::check_vreg_group_alignment;
5use crate::v::zvexx::arith::zvexx_arith_helpers::{read_element_u64, write_element_u64};
6use crate::v::zvexx::load::zvexx_load_helpers::{mask_bit, snapshot_mask};
7use crate::v::zvexx::zvexx_helpers::INSTRUCTION_SIZE;
8use crate::{ExecutionError, ProgramCounter};
9use ab_riscv_primitives::prelude::*;
10use core::fmt;
11
12/// Check that register groups `[a, a+count)` and `[b, b+count)` do not overlap.
13///
14/// Both groups must have the same size `count`. For groups of different sizes use
15/// [`check_no_overlap_asymmetric`].
16#[inline(always)]
17#[doc(hidden)]
18pub fn check_no_overlap<Reg, Memory, PC, CustomError>(
19    program_counter: &PC,
20    a: VReg,
21    b: VReg,
22    count: u8,
23) -> Result<(), ExecutionError<Reg::Type, CustomError>>
24where
25    Reg: Register,
26    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
27{
28    let a_start = u16::from(a.to_bits());
29    let b_start = u16::from(b.to_bits());
30    let count = u16::from(count);
31    // Intervals [a_start, a_start+count) and [b_start, b_start+count) overlap iff
32    // each starts before the other ends. Arithmetic is widened to u16 to avoid u8 overflow
33    // (e.g., b_start=30 + count=8 = 38, which overflows u8).
34    if a_start < b_start + count && b_start < a_start + count {
35        return Err(ExecutionError::IllegalInstruction {
36            address: program_counter.old_pc(INSTRUCTION_SIZE),
37        });
38    }
39    Ok(())
40}
41
42/// Check that register group `[a, a+a_count)` does not overlap `[b, b+b_count)`.
43///
44/// Unlike [`check_no_overlap`], the two groups are allowed to have different sizes.
45/// Used for `vrgatherei16.vv` where vd/vs2 use LMUL-derived `group_regs` and vs1
46/// uses EEW=16-derived `index_group_regs`.
47#[inline(always)]
48#[doc(hidden)]
49pub fn check_no_overlap_asymmetric<Reg, Memory, PC, CustomError>(
50    program_counter: &PC,
51    a: VReg,
52    a_count: u8,
53    b: VReg,
54    b_count: u8,
55) -> Result<(), ExecutionError<Reg::Type, CustomError>>
56where
57    Reg: Register,
58    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
59{
60    let a_start = u16::from(a.to_bits());
61    let b_start = u16::from(b.to_bits());
62    let a_count = u16::from(a_count);
63    let b_count = u16::from(b_count);
64    // Intervals [a_start, a_start+a_count) and [b_start, b_start+b_count) overlap iff
65    // each starts before the other ends.
66    if a_start < b_start + b_count && b_start < a_start + a_count {
67        return Err(ExecutionError::IllegalInstruction {
68            address: program_counter.old_pc(INSTRUCTION_SIZE),
69        });
70    }
71    Ok(())
72}
73
74/// Read element 0 of register `base_reg` as `u64`, zero-extended.
75///
76/// # Safety
77/// `sew.bytes() <= VLENB`
78#[inline(always)]
79pub unsafe fn read_element_0_u64<const VLENB: usize>(
80    vregs: &VectorRegisterFile<VLENB>,
81    base_reg: VReg,
82    sew: Vsew,
83) -> u64 {
84    let sew_bytes = usize::from(sew.bytes_width());
85    let reg = vregs.get(base_reg);
86    let mut buf = [0u8; 8];
87    // SAFETY: `sew_bytes <= VLENB` for all legal vtype; `sew_bytes <= 8`
88    unsafe {
89        buf.get_unchecked_mut(..sew_bytes)
90            .copy_from_slice(reg.get_unchecked(..sew_bytes));
91    }
92    u64::from_le_bytes(buf)
93}
94
95/// Write element 0 of register `base_reg` from the low `sew_bytes` of `value`.
96///
97/// # Safety
98/// `sew.bytes() <= VLENB`
99#[inline(always)]
100pub unsafe fn write_element_0_u64<const VLENB: usize>(
101    vregs: &mut VectorRegisterFile<VLENB>,
102    base_reg: VReg,
103    sew: Vsew,
104    value: u64,
105) {
106    let sew_bytes = usize::from(sew.bytes_width());
107    let buf = value.to_le_bytes();
108    let reg = vregs.get_mut(base_reg);
109    // SAFETY: `sew_bytes <= VLENB`; `sew_bytes <= 8`
110    unsafe {
111        reg.get_unchecked_mut(..sew_bytes)
112            .copy_from_slice(buf.get_unchecked(..sew_bytes));
113    }
114}
115
116/// Sign-extend the low `sew.bits_width()` of `val` to the register type width.
117///
118/// The arithmetic is performed entirely in 64-bit signed integer space: we shift the SEW-wide
119/// value left to place its sign bit at bit 63, then arithmetic-right-shift back to propagate it.
120/// The resulting `u64` is then narrowed to `Reg::Type` (32 or 64 bits) by combining via
121/// `From<u32>` - the only integer conversion in the `Register::Type` trait bounds.
122///
123/// For RV32 (`Reg::XLEN == 32`) the low 32 bits are already the correct sign-extended result
124/// because the arithmetic shift propagates the sign across all 64 bits and then we discard the
125/// upper half.
126///
127/// For RV64 (`Reg::XLEN == 64`) we must preserve all 64 bits. Since `Reg::Type: From<u32>` and
128/// `Reg::Type: Shl<u8>`, we reconstruct the 64-bit value by OR-ing two 32-bit halves shifted
129/// into position.
130#[inline(always)]
131pub fn sign_extend_to_reg<Reg>(val: u64, sew: Vsew) -> Reg::Type
132where
133    Reg: Register,
134{
135    let sew_bits = u32::from(sew.bits_width());
136    // `shift` is in [0, 64). When sew_bits == 64, shift == 0 and the value is unchanged.
137    let shift = u64::BITS - sew_bits;
138    // Cast to i64 so the right-shift is arithmetic (sign-extending).
139    let sign_extended = (val.cast_signed() << shift) >> shift;
140    let raw = sign_extended.cast_unsigned();
141    if Reg::XLEN == u64::BITS as u8 {
142        // RV64: preserve all 64 bits by splitting into two u32 halves.
143        let lo = Reg::Type::from(raw as u32);
144        let hi = Reg::Type::from((raw >> u32::BITS) as u32);
145        lo | (hi << 32u8)
146    } else {
147        // RV32: the low 32 bits are the correctly truncated result.
148        Reg::Type::from(raw as u32)
149    }
150}
151
152/// Execute a vslideup operation.
153///
154/// Elements `vstart..min(offset, vl)` in vd are unchanged.
155/// Elements `max(vstart, offset)..vl` where mask is active get vs2[i - offset].
156///
157/// # Safety
158/// - `vd` and `vs2` are validly aligned and non-overlapping (verified by caller).
159/// - `vl <= group_regs * VLENB / sew_bytes`.
160/// - When `vm=false`: `vd.to_bits() != 0`.
161#[inline(always)]
162#[doc(hidden)]
163pub unsafe fn execute_slideup<Reg, ExtState, CustomError>(
164    ext_state: &mut ExtState,
165    vd: VReg,
166    vs2: VReg,
167    vm: bool,
168    sew: Vsew,
169    offset: u64,
170) where
171    Reg: Register,
172    ExtState: VectorRegistersExt<Reg, CustomError>,
173    [(); ExtState::ELEN as usize]:,
174    [(); ExtState::VLEN as usize]:,
175    [(); ExtState::VLENB as usize]:,
176    CustomError: fmt::Debug,
177{
178    let vl = ext_state.vl();
179    let vstart = ext_state.vstart();
180    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
181    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
182    // Per spec §16.3.1: elements 0..offset are never written (vd keeps its value).
183    // The active range starts at max(vstart, offset).
184    let start = u32::from(vstart).max(offset.min(u64::from(u32::MAX)) as u32);
185    for i in start..vl {
186        if !mask_bit(&mask_buf, i) {
187            continue;
188        }
189        let src_idx = u64::from(i) - offset;
190        // SAFETY: src_idx < vl <= group_regs * elems_per_reg, so source element is in range
191        let val = unsafe { read_element_u64(ext_state.read_vregs(), vs2, src_idx as u32, sew) };
192        // SAFETY: i < vl <= group_regs * elems_per_reg, so dest element is in range
193        unsafe {
194            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
195        }
196    }
197    ext_state.mark_vs_dirty();
198    ext_state.reset_vstart();
199}
200
201/// Execute a vslidedown operation.
202///
203/// Element `vd[i] = vs2[i + offset]` if `i + offset < vlmax`, else `0`.
204///
205/// # Safety
206/// - `vd` and `vs2` are validly aligned (verified by caller); overlap is permitted.
207/// - `vl <= vlmax`.
208/// - When `vm=false`: `vd.to_bits() != 0`.
209#[inline(always)]
210#[doc(hidden)]
211pub unsafe fn execute_slidedown<Reg, ExtState, CustomError>(
212    ext_state: &mut ExtState,
213    vd: VReg,
214    vs2: VReg,
215    vm: bool,
216    sew: Vsew,
217    vlmax: u32,
218    offset: u64,
219) where
220    Reg: Register,
221    ExtState: VectorRegistersExt<Reg, CustomError>,
222    [(); ExtState::ELEN as usize]:,
223    [(); ExtState::VLEN as usize]:,
224    [(); ExtState::VLENB as usize]:,
225    CustomError: fmt::Debug,
226{
227    let vl = ext_state.vl();
228    let vstart = ext_state.vstart();
229    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
230    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
231    for i in u32::from(vstart)..vl {
232        if !mask_bit(&mask_buf, i) {
233            continue;
234        }
235        // Use checked_add to guard against offset being so large that i + offset overflows u64.
236        // Any value that wraps past u64::MAX is trivially >= vlmax, so the spec requires vd[i]=0.
237        let val = if let Some(src_idx) = u64::from(i).checked_add(offset)
238            && src_idx < u64::from(vlmax)
239        {
240            // SAFETY: src_idx < vlmax <= group_regs * elems_per_reg, so element is in range
241            unsafe { read_element_u64(ext_state.read_vregs(), vs2, src_idx as u32, sew) }
242        } else {
243            0
244        };
245        // SAFETY: i < vl <= vlmax <= group_regs * elems_per_reg
246        unsafe {
247            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
248        }
249    }
250    ext_state.mark_vs_dirty();
251    ext_state.reset_vstart();
252}
253
254/// Execute a vslide1up operation.
255///
256/// Element 0 of vd gets `scalar` (when active and vl > 0).
257/// Element `i` for `1 <= i < vl` gets `vs2[i - 1]`.
258/// vd must not overlap vs2.
259///
260/// # Safety
261/// - `vd` and `vs2` are validly aligned and non-overlapping (verified by caller).
262/// - `vl <= group_regs * VLENB / sew_bytes`.
263/// - When `vm=false`: `vd.to_bits() != 0`.
264#[inline(always)]
265#[doc(hidden)]
266pub unsafe fn execute_slide1up<Reg, ExtState, CustomError>(
267    ext_state: &mut ExtState,
268    vd: VReg,
269    vs2: VReg,
270    vm: bool,
271    sew: Vsew,
272    scalar: u64,
273) where
274    Reg: Register,
275    ExtState: VectorRegistersExt<Reg, CustomError>,
276    [(); ExtState::ELEN as usize]:,
277    [(); ExtState::VLEN as usize]:,
278    [(); ExtState::VLENB as usize]:,
279    CustomError: fmt::Debug,
280{
281    let vl = ext_state.vl();
282    let vstart = ext_state.vstart();
283    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
284    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
285    for i in u32::from(vstart)..vl {
286        if !mask_bit(&mask_buf, i) {
287            continue;
288        }
289        let val = if i == 0 {
290            scalar
291        } else {
292            // SAFETY: i - 1 < vl <= group_regs * elems_per_reg
293            unsafe { read_element_u64(ext_state.read_vregs(), vs2, i - 1, sew) }
294        };
295        // SAFETY: i < vl <= group_regs * elems_per_reg
296        unsafe {
297            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
298        }
299    }
300    ext_state.mark_vs_dirty();
301    ext_state.reset_vstart();
302}
303
304/// Execute a vslide1down operation.
305///
306/// Element `vd[i] = vs2[i + 1]` for `i < vl - 1`; element `vd[vl - 1]` gets `scalar`.
307///
308/// Overlap between `vd` and `vs2` is permitted by the spec. When they share the same register
309/// group base (exact overlap), ascending iteration is still correct: each write goes to byte range
310/// `[i*sew, (i+1)*sew)` while the subsequent read comes from `[(i+1)*sew, (i+2)*sew)`. These
311/// ranges are adjacent and non-overlapping, so writing element `i` never corrupts the source bytes
312/// of element `i+1`.
313///
314/// # Safety
315/// - `vd` and `vs2` are validly aligned (verified by caller); overlap is permitted.
316/// - `vl <= group_regs * VLENB / sew_bytes`.
317/// - When `vm=false`: `vd.to_bits() != 0`.
318#[inline(always)]
319#[doc(hidden)]
320pub unsafe fn execute_slide1down<Reg, ExtState, CustomError>(
321    ext_state: &mut ExtState,
322    vd: VReg,
323    vs2: VReg,
324    vm: bool,
325    sew: Vsew,
326    scalar: u64,
327) where
328    Reg: Register,
329    ExtState: VectorRegistersExt<Reg, CustomError>,
330    [(); ExtState::ELEN as usize]:,
331    [(); ExtState::VLEN as usize]:,
332    [(); ExtState::VLENB as usize]:,
333    CustomError: fmt::Debug,
334{
335    let vl = ext_state.vl();
336    let vstart = ext_state.vstart();
337    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
338    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
339    for i in u32::from(vstart)..vl {
340        if !mask_bit(&mask_buf, i) {
341            continue;
342        }
343        let val = if i + 1 < vl {
344            // SAFETY: i + 1 < vl <= group_regs * elems_per_reg
345            unsafe { read_element_u64(ext_state.read_vregs(), vs2, i + 1, sew) }
346        } else {
347            scalar
348        };
349        // SAFETY: i < vl <= group_regs * elems_per_reg
350        unsafe {
351            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
352        }
353    }
354    ext_state.mark_vs_dirty();
355    ext_state.reset_vstart();
356}
357
358/// Execute vrgather.vv: `vd[i] = (vs1[i] < vlmax) ? vs2[vs1[i]] : 0`.
359///
360/// # Safety
361/// - `vd`, `vs2`, and `vs1` are validly aligned and mutually non-overlapping (verified by caller).
362/// - `vl <= vlmax`.
363/// - When `vm=false`: `vd.to_bits() != 0`.
364#[inline(always)]
365#[doc(hidden)]
366pub unsafe fn execute_rgather_vv<Reg, ExtState, CustomError>(
367    ext_state: &mut ExtState,
368    vd: VReg,
369    vs2: VReg,
370    vs1: VReg,
371    vm: bool,
372    sew: Vsew,
373    vlmax: u32,
374) where
375    Reg: Register,
376    ExtState: VectorRegistersExt<Reg, CustomError>,
377    [(); ExtState::ELEN as usize]:,
378    [(); ExtState::VLEN as usize]:,
379    [(); ExtState::VLENB as usize]:,
380    CustomError: fmt::Debug,
381{
382    let vl = ext_state.vl();
383    let vstart = ext_state.vstart();
384    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
385    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
386    for i in u32::from(vstart)..vl {
387        if !mask_bit(&mask_buf, i) {
388            continue;
389        }
390        // SAFETY: i < vl <= group_regs * elems_per_reg for vs1
391        let index = unsafe { read_element_u64(ext_state.read_vregs(), vs1, i, sew) };
392        let val = if index < u64::from(vlmax) {
393            // SAFETY: index < vlmax <= group_regs * elems_per_reg for vs2
394            unsafe { read_element_u64(ext_state.read_vregs(), vs2, index as u32, sew) }
395        } else {
396            0u64
397        };
398        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
399        unsafe {
400            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
401        }
402    }
403    ext_state.mark_vs_dirty();
404    ext_state.reset_vstart();
405}
406
407/// Execute vrgather.vx / vrgather.vi: all active elements get `vs2[index]` or `0`.
408///
409/// # Safety
410/// - `vd` and `vs2` are validly aligned and non-overlapping (verified by caller).
411/// - `vl <= vlmax`.
412/// - When `vm=false`: `vd.to_bits() != 0`.
413#[inline(always)]
414#[doc(hidden)]
415pub unsafe fn execute_rgather_scalar<Reg, ExtState, CustomError>(
416    ext_state: &mut ExtState,
417    vd: VReg,
418    vs2: VReg,
419    vm: bool,
420    sew: Vsew,
421    vlmax: u32,
422    index: u64,
423) where
424    Reg: Register,
425    ExtState: VectorRegistersExt<Reg, CustomError>,
426    [(); ExtState::ELEN as usize]:,
427    [(); ExtState::VLEN as usize]:,
428    [(); ExtState::VLENB as usize]:,
429    CustomError: fmt::Debug,
430{
431    let vl = ext_state.vl();
432    let vstart = ext_state.vstart();
433    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
434    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
435    // Pre-compute the gathered value; it's the same for all elements.
436    let val = if index < u64::from(vlmax) {
437        // SAFETY: index < vlmax <= group_regs * elems_per_reg for vs2
438        unsafe { read_element_u64(ext_state.read_vregs(), vs2, index as u32, sew) }
439    } else {
440        0u64
441    };
442    for i in u32::from(vstart)..vl {
443        if !mask_bit(&mask_buf, i) {
444            continue;
445        }
446        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
447        unsafe {
448            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
449        }
450    }
451    ext_state.mark_vs_dirty();
452    ext_state.reset_vstart();
453}
454
455/// Execute vrgatherei16.vv: `vd[i] = (vs1_16[i] < vlmax) ? vs2[vs1_16[i]] : 0`.
456///
457/// `vs1` always uses EEW=16 regardless of SEW. `vl` must not exceed the index register group
458/// capacity, i.e. `vl <= index_group_regs * VLENB / 2` (VLENB/2 = elems per register at EEW=16).
459///
460/// # Safety
461/// - `vd`, `vs2`, and `vs1` are validly aligned and mutually non-overlapping (verified by caller).
462/// - `vl <= vlmax` (for the data register group) AND `vl <= index_group_regs * VLENB / 2` (for the
463///   index register group).
464/// - When `vm=false`: `vd.to_bits() != 0`.
465#[inline(always)]
466#[expect(clippy::too_many_arguments, reason = "Internal API")]
467#[doc(hidden)]
468pub unsafe fn execute_rgatherei16<Reg, ExtState, CustomError>(
469    ext_state: &mut ExtState,
470    vd: VReg,
471    vs2: VReg,
472    vs1: VReg,
473    vm: bool,
474    sew: Vsew,
475    vlmax: u32,
476    index_group_regs: u8,
477) where
478    Reg: Register,
479    ExtState: VectorRegistersExt<Reg, CustomError>,
480    [(); ExtState::ELEN as usize]:,
481    [(); ExtState::VLEN as usize]:,
482    [(); ExtState::VLENB as usize]:,
483    CustomError: fmt::Debug,
484{
485    let vl = ext_state.vl();
486    let vstart = ext_state.vstart();
487    // Maximum number of EEW=16 elements the index register group can hold.
488    // Each register holds VLENB / 2 elements at EEW=16.
489    let index_capacity = u32::from(index_group_regs) * (ExtState::VLENB / 2);
490    // `vl` must not exceed either the data VLMAX or the index register group capacity.
491    // Both bounds are guaranteed by the caller; this debug assertion catches misuse early.
492    debug_assert!(
493        vl <= vlmax && vl <= index_capacity,
494        "vl={vl} exceeds vlmax={vlmax} or index_capacity={index_capacity}"
495    );
496    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
497    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
498    for i in u32::from(vstart)..vl {
499        if !mask_bit(&mask_buf, i) {
500            continue;
501        }
502        // Read 16-bit index from vs1; EEW=16 always.
503        // SAFETY: i < vl <= index_capacity = index_group_regs * (VLENB/2), so element i
504        // fits within the index register group.
505        let index = unsafe { read_element_u64(ext_state.read_vregs(), vs1, i, Vsew::E16) };
506        let val = if index < u64::from(vlmax) {
507            // SAFETY: index < vlmax <= group_regs * elems_per_reg for vs2
508            unsafe { read_element_u64(ext_state.read_vregs(), vs2, index as u32, sew) }
509        } else {
510            0u64
511        };
512        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
513        unsafe {
514            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
515        }
516    }
517    ext_state.mark_vs_dirty();
518    ext_state.reset_vstart();
519}
520
521/// Execute vmerge.vvm / vmv.v.v.
522///
523/// When `vm=true` (vmv.v.v): all active elements `vstart..vl` get `vs1[i]`; vs2 unused.
524/// When `vm=false` (vmerge.vvm): active elements where `v0[i]=1` get `vs1[i]`,
525/// inactive elements get `vs2[i]`.
526///
527/// # Safety
528/// - `vd` and `vs1` are validly aligned (verified by caller).
529/// - When `vm=false`: `vs2` is validly aligned and `vd` does not overlap v0 (verified by caller).
530/// - `vl <= group_regs * VLENB / sew_bytes`.
531#[inline(always)]
532#[doc(hidden)]
533pub unsafe fn execute_merge_vv<Reg, ExtState, CustomError>(
534    ext_state: &mut ExtState,
535    vd: VReg,
536    vs2: VReg,
537    vs1: VReg,
538    vm: bool,
539    sew: Vsew,
540) where
541    Reg: Register,
542    ExtState: VectorRegistersExt<Reg, CustomError>,
543    [(); ExtState::ELEN as usize]:,
544    [(); ExtState::VLEN as usize]:,
545    [(); ExtState::VLENB as usize]:,
546    CustomError: fmt::Debug,
547{
548    let vl = ext_state.vl();
549    let vstart = ext_state.vstart();
550    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`.
551    // For vmv.v.v (vm=true) the mask is all-ones so snapshot_mask is still valid.
552    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
553    for i in u32::from(vstart)..vl {
554        let mask_set = mask_bit(&mask_buf, i);
555        let val = if mask_set {
556            // SAFETY: i < vl <= group_regs * elems_per_reg for vs1
557            unsafe { read_element_u64(ext_state.read_vregs(), vs1, i, sew) }
558        } else {
559            // mask_set=false only reachable when vm=false (vmerge path).
560            // SAFETY: i < vl <= group_regs * elems_per_reg for vs2
561            unsafe { read_element_u64(ext_state.read_vregs(), vs2, i, sew) }
562        };
563        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
564        unsafe {
565            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
566        }
567    }
568    ext_state.mark_vs_dirty();
569    ext_state.reset_vstart();
570}
571
572/// Execute vmerge.vxm / vmerge.vim / vmv.v.x / vmv.v.i.
573///
574/// When `vm=true`: all active elements `vstart..vl` get `scalar`; vs2 unused.
575/// When `vm=false`: active elements where `v0[i]=1` get `scalar`,
576/// inactive elements get `vs2[i]`.
577///
578/// # Safety
579/// - `vd` is validly aligned (verified by caller).
580/// - When `vm=false`: `vs2` is validly aligned and `vd` does not overlap v0 (verified by caller).
581/// - `vl <= group_regs * VLENB / sew_bytes`.
582#[inline(always)]
583#[doc(hidden)]
584pub unsafe fn execute_merge_scalar<Reg, ExtState, CustomError>(
585    ext_state: &mut ExtState,
586    vd: VReg,
587    vs2: VReg,
588    vm: bool,
589    sew: Vsew,
590    scalar: u64,
591) where
592    Reg: Register,
593    ExtState: VectorRegistersExt<Reg, CustomError>,
594    [(); ExtState::ELEN as usize]:,
595    [(); ExtState::VLEN as usize]:,
596    [(); ExtState::VLENB as usize]:,
597    CustomError: fmt::Debug,
598{
599    let vl = ext_state.vl();
600    let vstart = ext_state.vstart();
601    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`.
602    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
603
604    for i in u32::from(vstart)..vl {
605        let val = if mask_bit(&mask_buf, i) {
606            scalar
607        } else {
608            // SAFETY: i < vl <= group_regs * elems_per_reg for vs2
609            unsafe { read_element_u64(ext_state.read_vregs(), vs2, i, sew) }
610        };
611        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
612        unsafe {
613            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
614        }
615    }
616    ext_state.mark_vs_dirty();
617    ext_state.reset_vstart();
618}
619
620/// Execute vcompress.vm: pack active elements of vs2 (under vs1 mask) sequentially into vd.
621///
622/// `vs1` is treated as an explicit mask register (single register, not LMUL-grouped).
623/// The output write index increments only for elements where `vs1[i]` is set.
624/// vd must not overlap vs1 or vs2.
625///
626/// # Safety
627/// - `vd`, `vs2` are validly aligned and non-overlapping (verified by caller).
628/// - `vs1` does not overlap `vd` (verified by caller).
629/// - `vl <= VLMAX`.
630#[inline(always)]
631#[doc(hidden)]
632pub unsafe fn execute_compress<Reg, ExtState, CustomError>(
633    ext_state: &mut ExtState,
634    vd: VReg,
635    vs2: VReg,
636    vs1: VReg,
637    vl: u32,
638    sew: Vsew,
639) where
640    Reg: Register,
641    ExtState: VectorRegistersExt<Reg, CustomError>,
642    [(); ExtState::ELEN as usize]:,
643    [(); ExtState::VLEN as usize]:,
644    [(); ExtState::VLENB as usize]:,
645    CustomError: fmt::Debug,
646{
647    let mask_bytes = vl.div_ceil(u8::BITS) as usize;
648    let vreg = ext_state.read_vregs();
649    let mut vs1_buf = [0u8; { ExtState::VLENB as usize }];
650    // SAFETY: mask_bytes <= VLENB since vl <= VLEN; vs1_base < 32
651    unsafe {
652        vs1_buf
653            .get_unchecked_mut(..mask_bytes)
654            .copy_from_slice(vreg.get(vs1).get_unchecked(..mask_bytes));
655    }
656    let mut out_idx = 0u32;
657    for i in 0..vl {
658        if !mask_bit(&vs1_buf, i) {
659            continue;
660        }
661        // SAFETY: i < vl <= group_regs * elems_per_reg
662        let val = unsafe { read_element_u64(ext_state.read_vregs(), vs2, i, sew) };
663        // SAFETY: out_idx <= popcount(vs1[0..vl)) <= vl
664        unsafe {
665            write_element_u64(ext_state.write_vregs(), vd, out_idx, sew, val);
666        }
667        out_idx += 1;
668    }
669    ext_state.mark_vs_dirty();
670    ext_state.reset_vstart();
671}
672
673/// Copy `count` whole vector registers from `src_base` to `dst_base`.
674///
675/// No masking, no vtype dependency. Uses snapshot semantics: all source registers are read into
676/// a stack buffer before any destination registers are written, giving correct memmove-style
677/// behaviour for all overlap patterns (including partial overlap such as src=V0, dst=V1, count=2).
678///
679/// The stack allocation is at most 8 × VLENB bytes (`count <= 8` for vmv1r–vmv8r).
680///
681/// # Safety
682/// - `dst_base + count <= 32` and `src_base + count <= 32` (verified by caller via alignment
683///   checks).
684/// - `dst_base % count == 0` and `src_base % count == 0` (verified by caller).
685#[inline(always)]
686#[doc(hidden)]
687pub unsafe fn execute_whole_reg_move<const VLENB: usize>(
688    vregs: &mut VectorRegisterFile<VLENB>,
689    dst_base: VReg,
690    src_base: VReg,
691    count: u8,
692) {
693    let count = usize::from(count);
694    debug_assert!(count <= 8, "count must be <= 8 for vmvNr");
695    // Snapshot all source registers before writing any destination registers.
696    // This is correct for all overlap patterns without direction-dependent logic.
697    let mut tmp = [[0u8; VLENB]; 8];
698    for (k, item) in tmp.iter_mut().enumerate().take(count) {
699        // SAFETY: Guaranteed by function contract
700        let src = unsafe { VReg::from_bits(src_base.to_bits() + k as u8).unwrap_unchecked() };
701        *item = *vregs.get(src);
702    }
703    for (k, item) in tmp.iter().enumerate().take(count) {
704        // SAFETY: Guaranteed by function contract
705        let dst = unsafe { VReg::from_bits(dst_base.to_bits() + k as u8).unwrap_unchecked() };
706        *vregs.get_mut(dst) = *item;
707    }
708}