Skip to main content

ab_riscv_interpreter/v/zvexx/perm/
zvexx_perm_helpers.rs

//! Opaque helpers for the ZveXx extension
2
3use crate::v::vector_registers::{VectorRegisterFile, VectorRegistersExt};
4pub use crate::v::zvexx::arith::zvexx_arith_helpers::check_vreg_group_alignment;
5use crate::v::zvexx::arith::zvexx_arith_helpers::{read_element_u64, write_element_u64};
6use crate::v::zvexx::load::zvexx_load_helpers::{mask_bit, snapshot_mask};
7use crate::v::zvexx::zvexx_helpers::INSTRUCTION_SIZE;
8use crate::{ExecutionError, ProgramCounter};
9use ab_riscv_primitives::prelude::*;
10use core::fmt;
11use core::hint::cold_path;
12
13/// Check that register groups `[a, a+count)` and `[b, b+count)` do not overlap.
14///
15/// Both groups must have the same size `count`. For groups of different sizes use
16/// [`check_no_overlap_asymmetric`].
17#[inline(always)]
18#[doc(hidden)]
19pub fn check_no_overlap<Reg, Memory, PC, CustomError>(
20    program_counter: &PC,
21    a: VReg,
22    b: VReg,
23    count: u8,
24) -> Result<(), ExecutionError<Reg::Type, CustomError>>
25where
26    Reg: Register,
27    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
28{
29    let a_start = u16::from(a.to_bits());
30    let b_start = u16::from(b.to_bits());
31    let count = u16::from(count);
32    // Intervals [a_start, a_start+count) and [b_start, b_start+count) overlap iff
33    // each starts before the other ends. Arithmetic is widened to u16 to avoid u8 overflow
34    // (e.g., b_start=30 + count=8 = 38, which overflows u8).
35    if a_start < b_start + count && b_start < a_start + count {
36        cold_path();
37        return Err(ExecutionError::IllegalInstruction {
38            address: program_counter.old_pc(INSTRUCTION_SIZE),
39        });
40    }
41    Ok(())
42}
43
44/// Check that register group `[a, a+a_count)` does not overlap `[b, b+b_count)`.
45///
46/// Unlike [`check_no_overlap`], the two groups are allowed to have different sizes.
47/// Used for `vrgatherei16.vv` where vd/vs2 use LMUL-derived `group_regs` and vs1
48/// uses EEW=16-derived `index_group_regs`.
49#[inline(always)]
50#[doc(hidden)]
51pub fn check_no_overlap_asymmetric<Reg, Memory, PC, CustomError>(
52    program_counter: &PC,
53    a: VReg,
54    a_count: u8,
55    b: VReg,
56    b_count: u8,
57) -> Result<(), ExecutionError<Reg::Type, CustomError>>
58where
59    Reg: Register,
60    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
61{
62    let a_start = u16::from(a.to_bits());
63    let b_start = u16::from(b.to_bits());
64    let a_count = u16::from(a_count);
65    let b_count = u16::from(b_count);
66    // Intervals [a_start, a_start+a_count) and [b_start, b_start+b_count) overlap iff
67    // each starts before the other ends.
68    if a_start < b_start + b_count && b_start < a_start + a_count {
69        cold_path();
70        return Err(ExecutionError::IllegalInstruction {
71            address: program_counter.old_pc(INSTRUCTION_SIZE),
72        });
73    }
74    Ok(())
75}
76
77/// Read element 0 of register `base_reg` as `u64`, zero-extended.
78///
79/// # Safety
80/// `sew.bytes() <= VLENB`
81#[inline(always)]
82pub unsafe fn read_element_0_u64<const VLENB: usize>(
83    vregs: &VectorRegisterFile<VLENB>,
84    base_reg: VReg,
85    sew: Vsew,
86) -> u64 {
87    let sew_bytes = usize::from(sew.bytes_width());
88    let reg = vregs.get(base_reg);
89    let mut buf = [0u8; 8];
90    // SAFETY: `sew_bytes <= VLENB` for all legal vtype; `sew_bytes <= 8`
91    unsafe {
92        buf.get_unchecked_mut(..sew_bytes)
93            .copy_from_slice(reg.get_unchecked(..sew_bytes));
94    }
95    u64::from_le_bytes(buf)
96}
97
98/// Write element 0 of register `base_reg` from the low `sew_bytes` of `value`.
99///
100/// # Safety
101/// `sew.bytes() <= VLENB`
102#[inline(always)]
103pub unsafe fn write_element_0_u64<const VLENB: usize>(
104    vregs: &mut VectorRegisterFile<VLENB>,
105    base_reg: VReg,
106    sew: Vsew,
107    value: u64,
108) {
109    let sew_bytes = usize::from(sew.bytes_width());
110    let buf = value.to_le_bytes();
111    let reg = vregs.get_mut(base_reg);
112    // SAFETY: `sew_bytes <= VLENB`; `sew_bytes <= 8`
113    unsafe {
114        reg.get_unchecked_mut(..sew_bytes)
115            .copy_from_slice(buf.get_unchecked(..sew_bytes));
116    }
117}
118
119/// Sign-extend the low `sew.bits_width()` of `val` to the register type width.
120///
121/// The arithmetic is performed entirely in 64-bit signed integer space: we shift the SEW-wide
122/// value left to place its sign bit at bit 63, then arithmetic-right-shift back to propagate it.
123/// The resulting `u64` is then narrowed to `Reg::Type` (32 or 64 bits) by combining via
124/// `From<u32>` - the only integer conversion in the `Register::Type` trait bounds.
125///
126/// For RV32 (`Reg::XLEN == 32`) the low 32 bits are already the correct sign-extended result
127/// because the arithmetic shift propagates the sign across all 64 bits and then we discard the
128/// upper half.
129///
130/// For RV64 (`Reg::XLEN == 64`) we must preserve all 64 bits. Since `Reg::Type: From<u32>` and
131/// `Reg::Type: Shl<u8>`, we reconstruct the 64-bit value by OR-ing two 32-bit halves shifted
132/// into position.
133#[inline(always)]
134pub fn sign_extend_to_reg<Reg>(val: u64, sew: Vsew) -> Reg::Type
135where
136    Reg: Register,
137{
138    let sew_bits = u32::from(sew.bits_width());
139    // `shift` is in [0, 64). When sew_bits == 64, shift == 0 and the value is unchanged.
140    let shift = u64::BITS - sew_bits;
141    // Cast to i64 so the right-shift is arithmetic (sign-extending).
142    let sign_extended = (val.cast_signed() << shift) >> shift;
143    let raw = sign_extended.cast_unsigned();
144    if Reg::XLEN == u64::BITS as u8 {
145        // RV64: preserve all 64 bits by splitting into two u32 halves.
146        let lo = Reg::Type::from(raw as u32);
147        let hi = Reg::Type::from((raw >> u32::BITS) as u32);
148        lo | (hi << 32u8)
149    } else {
150        // RV32: the low 32 bits are the correctly truncated result.
151        Reg::Type::from(raw as u32)
152    }
153}
154
155/// Execute a vslideup operation.
156///
157/// Elements `vstart..min(offset, vl)` in vd are unchanged.
158/// Elements `max(vstart, offset)..vl` where mask is active get vs2[i - offset].
159///
160/// # Safety
161/// - `vd` and `vs2` are validly aligned and non-overlapping (verified by caller).
162/// - `vl <= group_regs * VLENB / sew_bytes`.
163/// - When `vm=false`: `vd.to_bits() != 0`.
164#[inline(always)]
165#[doc(hidden)]
166pub unsafe fn execute_slideup<Reg, ExtState, CustomError>(
167    ext_state: &mut ExtState,
168    vd: VReg,
169    vs2: VReg,
170    vm: bool,
171    sew: Vsew,
172    offset: u64,
173) where
174    Reg: Register,
175    ExtState: VectorRegistersExt<Reg, CustomError>,
176    [(); ExtState::ELEN as usize]:,
177    [(); ExtState::VLEN as usize]:,
178    [(); ExtState::VLENB as usize]:,
179    CustomError: fmt::Debug,
180{
181    let vl = ext_state.vl();
182    let vstart = ext_state.vstart();
183    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
184    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
185    // Per spec ยง16.3.1: elements 0..offset are never written (vd keeps its value).
186    // The active range starts at max(vstart, offset).
187    let start = u32::from(vstart).max(offset.min(u64::from(u32::MAX)) as u32);
188    for i in start..vl {
189        if !mask_bit(&mask_buf, i) {
190            continue;
191        }
192        let src_idx = u64::from(i) - offset;
193        // SAFETY: src_idx < vl <= group_regs * elems_per_reg, so source element is in range
194        let val = unsafe { read_element_u64(ext_state.read_vregs(), vs2, src_idx as u32, sew) };
195        // SAFETY: i < vl <= group_regs * elems_per_reg, so dest element is in range
196        unsafe {
197            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
198        }
199    }
200    ext_state.mark_vs_dirty();
201    ext_state.reset_vstart();
202}
203
204/// Execute a vslidedown operation.
205///
206/// Element `vd[i] = vs2[i + offset]` if `i + offset < vlmax`, else `0`.
207///
208/// # Safety
209/// - `vd` and `vs2` are validly aligned (verified by caller); overlap is permitted.
210/// - `vl <= vlmax`.
211/// - When `vm=false`: `vd.to_bits() != 0`.
212#[inline(always)]
213#[doc(hidden)]
214pub unsafe fn execute_slidedown<Reg, ExtState, CustomError>(
215    ext_state: &mut ExtState,
216    vd: VReg,
217    vs2: VReg,
218    vm: bool,
219    sew: Vsew,
220    vlmax: u32,
221    offset: u64,
222) where
223    Reg: Register,
224    ExtState: VectorRegistersExt<Reg, CustomError>,
225    [(); ExtState::ELEN as usize]:,
226    [(); ExtState::VLEN as usize]:,
227    [(); ExtState::VLENB as usize]:,
228    CustomError: fmt::Debug,
229{
230    let vl = ext_state.vl();
231    let vstart = ext_state.vstart();
232    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
233    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
234    for i in u32::from(vstart)..vl {
235        if !mask_bit(&mask_buf, i) {
236            continue;
237        }
238        // Use checked_add to guard against offset being so large that i + offset overflows u64.
239        // Any value that wraps past u64::MAX is trivially >= vlmax, so the spec requires vd[i]=0.
240        let val = if let Some(src_idx) = u64::from(i).checked_add(offset)
241            && src_idx < u64::from(vlmax)
242        {
243            // SAFETY: src_idx < vlmax <= group_regs * elems_per_reg, so element is in range
244            unsafe { read_element_u64(ext_state.read_vregs(), vs2, src_idx as u32, sew) }
245        } else {
246            0
247        };
248        // SAFETY: i < vl <= vlmax <= group_regs * elems_per_reg
249        unsafe {
250            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
251        }
252    }
253    ext_state.mark_vs_dirty();
254    ext_state.reset_vstart();
255}
256
257/// Execute a vslide1up operation.
258///
259/// Element 0 of vd gets `scalar` (when active and vl > 0).
260/// Element `i` for `1 <= i < vl` gets `vs2[i - 1]`.
261/// vd must not overlap vs2.
262///
263/// # Safety
264/// - `vd` and `vs2` are validly aligned and non-overlapping (verified by caller).
265/// - `vl <= group_regs * VLENB / sew_bytes`.
266/// - When `vm=false`: `vd.to_bits() != 0`.
267#[inline(always)]
268#[doc(hidden)]
269pub unsafe fn execute_slide1up<Reg, ExtState, CustomError>(
270    ext_state: &mut ExtState,
271    vd: VReg,
272    vs2: VReg,
273    vm: bool,
274    sew: Vsew,
275    scalar: u64,
276) where
277    Reg: Register,
278    ExtState: VectorRegistersExt<Reg, CustomError>,
279    [(); ExtState::ELEN as usize]:,
280    [(); ExtState::VLEN as usize]:,
281    [(); ExtState::VLENB as usize]:,
282    CustomError: fmt::Debug,
283{
284    let vl = ext_state.vl();
285    let vstart = ext_state.vstart();
286    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
287    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
288    for i in u32::from(vstart)..vl {
289        if !mask_bit(&mask_buf, i) {
290            continue;
291        }
292        let val = if i == 0 {
293            scalar
294        } else {
295            // SAFETY: i - 1 < vl <= group_regs * elems_per_reg
296            unsafe { read_element_u64(ext_state.read_vregs(), vs2, i - 1, sew) }
297        };
298        // SAFETY: i < vl <= group_regs * elems_per_reg
299        unsafe {
300            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
301        }
302    }
303    ext_state.mark_vs_dirty();
304    ext_state.reset_vstart();
305}
306
307/// Execute a vslide1down operation.
308///
309/// Element `vd[i] = vs2[i + 1]` for `i < vl - 1`; element `vd[vl - 1]` gets `scalar`.
310///
311/// Overlap between `vd` and `vs2` is permitted by the spec. When they share the same register
312/// group base (exact overlap), ascending iteration is still correct: each write goes to byte range
313/// `[i*sew, (i+1)*sew)` while the subsequent read comes from `[(i+1)*sew, (i+2)*sew)`. These
314/// ranges are adjacent and non-overlapping, so writing element `i` never corrupts the source bytes
315/// of element `i+1`.
316///
317/// # Safety
318/// - `vd` and `vs2` are validly aligned (verified by caller); overlap is permitted.
319/// - `vl <= group_regs * VLENB / sew_bytes`.
320/// - When `vm=false`: `vd.to_bits() != 0`.
321#[inline(always)]
322#[doc(hidden)]
323pub unsafe fn execute_slide1down<Reg, ExtState, CustomError>(
324    ext_state: &mut ExtState,
325    vd: VReg,
326    vs2: VReg,
327    vm: bool,
328    sew: Vsew,
329    scalar: u64,
330) where
331    Reg: Register,
332    ExtState: VectorRegistersExt<Reg, CustomError>,
333    [(); ExtState::ELEN as usize]:,
334    [(); ExtState::VLEN as usize]:,
335    [(); ExtState::VLENB as usize]:,
336    CustomError: fmt::Debug,
337{
338    let vl = ext_state.vl();
339    let vstart = ext_state.vstart();
340    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
341    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
342    for i in u32::from(vstart)..vl {
343        if !mask_bit(&mask_buf, i) {
344            continue;
345        }
346        let val = if i + 1 < vl {
347            // SAFETY: i + 1 < vl <= group_regs * elems_per_reg
348            unsafe { read_element_u64(ext_state.read_vregs(), vs2, i + 1, sew) }
349        } else {
350            scalar
351        };
352        // SAFETY: i < vl <= group_regs * elems_per_reg
353        unsafe {
354            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
355        }
356    }
357    ext_state.mark_vs_dirty();
358    ext_state.reset_vstart();
359}
360
361/// Execute vrgather.vv: `vd[i] = (vs1[i] < vlmax) ? vs2[vs1[i]] : 0`.
362///
363/// # Safety
364/// - `vd`, `vs2`, and `vs1` are validly aligned and mutually non-overlapping (verified by caller).
365/// - `vl <= vlmax`.
366/// - When `vm=false`: `vd.to_bits() != 0`.
367#[inline(always)]
368#[doc(hidden)]
369pub unsafe fn execute_rgather_vv<Reg, ExtState, CustomError>(
370    ext_state: &mut ExtState,
371    vd: VReg,
372    vs2: VReg,
373    vs1: VReg,
374    vm: bool,
375    sew: Vsew,
376    vlmax: u32,
377) where
378    Reg: Register,
379    ExtState: VectorRegistersExt<Reg, CustomError>,
380    [(); ExtState::ELEN as usize]:,
381    [(); ExtState::VLEN as usize]:,
382    [(); ExtState::VLENB as usize]:,
383    CustomError: fmt::Debug,
384{
385    let vl = ext_state.vl();
386    let vstart = ext_state.vstart();
387    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
388    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
389    for i in u32::from(vstart)..vl {
390        if !mask_bit(&mask_buf, i) {
391            continue;
392        }
393        // SAFETY: i < vl <= group_regs * elems_per_reg for vs1
394        let index = unsafe { read_element_u64(ext_state.read_vregs(), vs1, i, sew) };
395        let val = if index < u64::from(vlmax) {
396            // SAFETY: index < vlmax <= group_regs * elems_per_reg for vs2
397            unsafe { read_element_u64(ext_state.read_vregs(), vs2, index as u32, sew) }
398        } else {
399            0u64
400        };
401        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
402        unsafe {
403            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
404        }
405    }
406    ext_state.mark_vs_dirty();
407    ext_state.reset_vstart();
408}
409
410/// Execute vrgather.vx / vrgather.vi: all active elements get `vs2[index]` or `0`.
411///
412/// # Safety
413/// - `vd` and `vs2` are validly aligned and non-overlapping (verified by caller).
414/// - `vl <= vlmax`.
415/// - When `vm=false`: `vd.to_bits() != 0`.
416#[inline(always)]
417#[doc(hidden)]
418pub unsafe fn execute_rgather_scalar<Reg, ExtState, CustomError>(
419    ext_state: &mut ExtState,
420    vd: VReg,
421    vs2: VReg,
422    vm: bool,
423    sew: Vsew,
424    vlmax: u32,
425    index: u64,
426) where
427    Reg: Register,
428    ExtState: VectorRegistersExt<Reg, CustomError>,
429    [(); ExtState::ELEN as usize]:,
430    [(); ExtState::VLEN as usize]:,
431    [(); ExtState::VLENB as usize]:,
432    CustomError: fmt::Debug,
433{
434    let vl = ext_state.vl();
435    let vstart = ext_state.vstart();
436    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
437    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
438    // Pre-compute the gathered value; it's the same for all elements.
439    let val = if index < u64::from(vlmax) {
440        // SAFETY: index < vlmax <= group_regs * elems_per_reg for vs2
441        unsafe { read_element_u64(ext_state.read_vregs(), vs2, index as u32, sew) }
442    } else {
443        0u64
444    };
445    for i in u32::from(vstart)..vl {
446        if !mask_bit(&mask_buf, i) {
447            continue;
448        }
449        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
450        unsafe {
451            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
452        }
453    }
454    ext_state.mark_vs_dirty();
455    ext_state.reset_vstart();
456}
457
458/// Execute vrgatherei16.vv: `vd[i] = (vs1_16[i] < vlmax) ? vs2[vs1_16[i]] : 0`.
459///
460/// `vs1` always uses EEW=16 regardless of SEW. `vl` must not exceed the index register group
461/// capacity, i.e. `vl <= index_group_regs * VLENB / 2` (VLENB/2 = elems per register at EEW=16).
462///
463/// # Safety
464/// - `vd`, `vs2`, and `vs1` are validly aligned and mutually non-overlapping (verified by caller).
465/// - `vl <= vlmax` (for the data register group) AND `vl <= index_group_regs * VLENB / 2` (for the
466///   index register group).
467/// - When `vm=false`: `vd.to_bits() != 0`.
468#[inline(always)]
469#[expect(clippy::too_many_arguments, reason = "Internal API")]
470#[doc(hidden)]
471pub unsafe fn execute_rgatherei16<Reg, ExtState, CustomError>(
472    ext_state: &mut ExtState,
473    vd: VReg,
474    vs2: VReg,
475    vs1: VReg,
476    vm: bool,
477    sew: Vsew,
478    vlmax: u32,
479    index_group_regs: u8,
480) where
481    Reg: Register,
482    ExtState: VectorRegistersExt<Reg, CustomError>,
483    [(); ExtState::ELEN as usize]:,
484    [(); ExtState::VLEN as usize]:,
485    [(); ExtState::VLENB as usize]:,
486    CustomError: fmt::Debug,
487{
488    let vl = ext_state.vl();
489    let vstart = ext_state.vstart();
490    // Maximum number of EEW=16 elements the index register group can hold.
491    // Each register holds VLENB / 2 elements at EEW=16.
492    let index_capacity = u32::from(index_group_regs) * (ExtState::VLENB / 2);
493    // `vl` must not exceed either the data VLMAX or the index register group capacity.
494    // Both bounds are guaranteed by the caller; this debug assertion catches misuse early.
495    debug_assert!(
496        vl <= vlmax && vl <= index_capacity,
497        "vl={vl} exceeds vlmax={vlmax} or index_capacity={index_capacity}"
498    );
499    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`
500    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
501    for i in u32::from(vstart)..vl {
502        if !mask_bit(&mask_buf, i) {
503            continue;
504        }
505        // Read 16-bit index from vs1; EEW=16 always.
506        // SAFETY: i < vl <= index_capacity = index_group_regs * (VLENB/2), so element i
507        // fits within the index register group.
508        let index = unsafe { read_element_u64(ext_state.read_vregs(), vs1, i, Vsew::E16) };
509        let val = if index < u64::from(vlmax) {
510            // SAFETY: index < vlmax <= group_regs * elems_per_reg for vs2
511            unsafe { read_element_u64(ext_state.read_vregs(), vs2, index as u32, sew) }
512        } else {
513            0u64
514        };
515        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
516        unsafe {
517            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
518        }
519    }
520    ext_state.mark_vs_dirty();
521    ext_state.reset_vstart();
522}
523
524/// Execute vmerge.vvm / vmv.v.v.
525///
526/// When `vm=true` (vmv.v.v): all active elements `vstart..vl` get `vs1[i]`; vs2 unused.
527/// When `vm=false` (vmerge.vvm): active elements where `v0[i]=1` get `vs1[i]`,
528/// inactive elements get `vs2[i]`.
529///
530/// # Safety
531/// - `vd` and `vs1` are validly aligned (verified by caller).
532/// - When `vm=false`: `vs2` is validly aligned and `vd` does not overlap v0 (verified by caller).
533/// - `vl <= group_regs * VLENB / sew_bytes`.
534#[inline(always)]
535#[doc(hidden)]
536pub unsafe fn execute_merge_vv<Reg, ExtState, CustomError>(
537    ext_state: &mut ExtState,
538    vd: VReg,
539    vs2: VReg,
540    vs1: VReg,
541    vm: bool,
542    sew: Vsew,
543) where
544    Reg: Register,
545    ExtState: VectorRegistersExt<Reg, CustomError>,
546    [(); ExtState::ELEN as usize]:,
547    [(); ExtState::VLEN as usize]:,
548    [(); ExtState::VLENB as usize]:,
549    CustomError: fmt::Debug,
550{
551    let vl = ext_state.vl();
552    let vstart = ext_state.vstart();
553    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`.
554    // For vmv.v.v (vm=true) the mask is all-ones so snapshot_mask is still valid.
555    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
556    for i in u32::from(vstart)..vl {
557        let mask_set = mask_bit(&mask_buf, i);
558        let val = if mask_set {
559            // SAFETY: i < vl <= group_regs * elems_per_reg for vs1
560            unsafe { read_element_u64(ext_state.read_vregs(), vs1, i, sew) }
561        } else {
562            // mask_set=false only reachable when vm=false (vmerge path).
563            // SAFETY: i < vl <= group_regs * elems_per_reg for vs2
564            unsafe { read_element_u64(ext_state.read_vregs(), vs2, i, sew) }
565        };
566        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
567        unsafe {
568            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
569        }
570    }
571    ext_state.mark_vs_dirty();
572    ext_state.reset_vstart();
573}
574
575/// Execute vmerge.vxm / vmerge.vim / vmv.v.x / vmv.v.i.
576///
577/// When `vm=true`: all active elements `vstart..vl` get `scalar`; vs2 unused.
578/// When `vm=false`: active elements where `v0[i]=1` get `scalar`,
579/// inactive elements get `vs2[i]`.
580///
581/// # Safety
582/// - `vd` is validly aligned (verified by caller).
583/// - When `vm=false`: `vs2` is validly aligned and `vd` does not overlap v0 (verified by caller).
584/// - `vl <= group_regs * VLENB / sew_bytes`.
585#[inline(always)]
586#[doc(hidden)]
587pub unsafe fn execute_merge_scalar<Reg, ExtState, CustomError>(
588    ext_state: &mut ExtState,
589    vd: VReg,
590    vs2: VReg,
591    vm: bool,
592    sew: Vsew,
593    scalar: u64,
594) where
595    Reg: Register,
596    ExtState: VectorRegistersExt<Reg, CustomError>,
597    [(); ExtState::ELEN as usize]:,
598    [(); ExtState::VLEN as usize]:,
599    [(); ExtState::VLENB as usize]:,
600    CustomError: fmt::Debug,
601{
602    let vl = ext_state.vl();
603    let vstart = ext_state.vstart();
604    // SAFETY: `vl <= VLEN`, so `vl.div_ceil(8) <= VLENB`.
605    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
606
607    for i in u32::from(vstart)..vl {
608        let val = if mask_bit(&mask_buf, i) {
609            scalar
610        } else {
611            // SAFETY: i < vl <= group_regs * elems_per_reg for vs2
612            unsafe { read_element_u64(ext_state.read_vregs(), vs2, i, sew) }
613        };
614        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
615        unsafe {
616            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
617        }
618    }
619    ext_state.mark_vs_dirty();
620    ext_state.reset_vstart();
621}
622
623/// Execute vcompress.vm: pack active elements of vs2 (under vs1 mask) sequentially into vd.
624///
625/// `vs1` is treated as an explicit mask register (single register, not LMUL-grouped).
626/// The output write index increments only for elements where `vs1[i]` is set.
627/// vd must not overlap vs1 or vs2.
628///
629/// # Safety
630/// - `vd`, `vs2` are validly aligned and non-overlapping (verified by caller).
631/// - `vs1` does not overlap `vd` (verified by caller).
632/// - `vl <= VLMAX`.
633#[inline(always)]
634#[doc(hidden)]
635pub unsafe fn execute_compress<Reg, ExtState, CustomError>(
636    ext_state: &mut ExtState,
637    vd: VReg,
638    vs2: VReg,
639    vs1: VReg,
640    vl: u32,
641    sew: Vsew,
642) where
643    Reg: Register,
644    ExtState: VectorRegistersExt<Reg, CustomError>,
645    [(); ExtState::ELEN as usize]:,
646    [(); ExtState::VLEN as usize]:,
647    [(); ExtState::VLENB as usize]:,
648    CustomError: fmt::Debug,
649{
650    let mask_bytes = vl.div_ceil(u8::BITS) as usize;
651    let vreg = ext_state.read_vregs();
652    let mut vs1_buf = [0u8; { ExtState::VLENB as usize }];
653    // SAFETY: mask_bytes <= VLENB since vl <= VLEN; vs1_base < 32
654    unsafe {
655        vs1_buf
656            .get_unchecked_mut(..mask_bytes)
657            .copy_from_slice(vreg.get(vs1).get_unchecked(..mask_bytes));
658    }
659    let mut out_idx = 0u32;
660    for i in 0..vl {
661        if !mask_bit(&vs1_buf, i) {
662            continue;
663        }
664        // SAFETY: i < vl <= group_regs * elems_per_reg
665        let val = unsafe { read_element_u64(ext_state.read_vregs(), vs2, i, sew) };
666        // SAFETY: out_idx <= popcount(vs1[0..vl)) <= vl
667        unsafe {
668            write_element_u64(ext_state.write_vregs(), vd, out_idx, sew, val);
669        }
670        out_idx += 1;
671    }
672    ext_state.mark_vs_dirty();
673    ext_state.reset_vstart();
674}
675
676/// Copy `COUNT` whole vector registers from `src_base` to `dst_base`.
677///
678/// No masking, no vtype dependency. Uses snapshot semantics: all source registers are read into
679/// a stack buffer before any destination registers are written, giving correct memmove-style
680/// behaviour for all overlap patterns (including partial overlap such as src=V0, dst=V1, count=2).
681///
682/// # Safety
683/// - `dst_base + COUNT <= 32` and `src_base + COUNT <= 32` (verified by caller via alignment
684///   checks).
685/// - `dst_base % COUNT == 0` and `src_base % COUNT == 0` (verified by caller).
686#[inline(always)]
687#[doc(hidden)]
688pub unsafe fn execute_whole_reg_move<const COUNT: usize, const VLENB: usize>(
689    vregs: &mut VectorRegisterFile<VLENB>,
690    dst_base: VReg,
691    src_base: VReg,
692) {
693    // Snapshot all source registers before writing any destination registers.
694    // This is correct for all overlap patterns without direction-dependent logic.
695    let mut tmp = [[0u8; VLENB]; COUNT];
696    for (k, item) in tmp.iter_mut().enumerate() {
697        // SAFETY: Guaranteed by function contract
698        let src = unsafe { VReg::from_bits(src_base.to_bits() + k as u8).unwrap_unchecked() };
699        *item = *vregs.get(src);
700    }
701    for (k, item) in tmp.iter().enumerate() {
702        // SAFETY: Guaranteed by function contract
703        let dst = unsafe { VReg::from_bits(dst_base.to_bits() + k as u8).unwrap_unchecked() };
704        *vregs.get_mut(dst) = *item;
705    }
706}