ab_riscv_interpreter/v/zvexx/perm/
zvexx_perm_helpers.rs

1//! Opaque helpers for ZveXx extension
2
3use crate::v::vector_registers::{VLENB_USIZE, VectorRegisterFile, VectorRegistersExt};
4pub use crate::v::zvexx::arith::zvexx_arith_helpers::check_vreg_group_alignment;
5use crate::v::zvexx::arith::zvexx_arith_helpers::{read_element_u64, write_element_u64};
6use crate::v::zvexx::load::zvexx_load_helpers::{mask_bit, snapshot_mask};
7use crate::v::zvexx::zvexx_helpers::INSTRUCTION_SIZE;
8use crate::{ExecutionError, ProgramCounter};
9use ab_riscv_primitives::prelude::*;
10use core::fmt;
11use core::hint::cold_path;
12use core::num::NonZeroU8;
13
14/// Check that register groups `[a, a+count)` and `[b, b+count)` do not overlap.
15///
16/// Both groups must have the same size `count`. For groups of different sizes use
17/// [`check_no_overlap_asymmetric`].
18#[inline(always)]
19#[doc(hidden)]
20#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
21pub fn check_no_overlap<Reg, Memory, PC, CustomError>(
22    program_counter: &PC,
23    a: VReg,
24    b: VReg,
25    count: NonZeroU8,
26) -> Result<(), ExecutionError<Reg::Type, CustomError>>
27where
28    Reg: Register,
29    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
30{
31    let a_start = u16::from(a.to_bits());
32    let b_start = u16::from(b.to_bits());
33    let count = u16::from(count.get());
34    // Intervals [a_start, a_start+count) and [b_start, b_start+count) overlap iff
35    // each starts before the other ends. Arithmetic is widened to u16 to avoid u8 overflow
36    // (e.g., b_start=30 + count=8 = 38, which overflows u8).
37    if a_start < b_start + count && b_start < a_start + count {
38        cold_path();
39        return Err(ExecutionError::IllegalInstruction {
40            address: program_counter.old_pc(INSTRUCTION_SIZE),
41        });
42    }
43    Ok(())
44}
45
46/// Check that register group `[a, a+a_count)` does not overlap `[b, b+b_count)`.
47///
48/// Unlike [`check_no_overlap`], the two groups are allowed to have different sizes.
49/// Used for `vrgatherei16.vv` where vd/vs2 use LMUL-derived `group_regs` and vs1
50/// uses EEW=16-derived `index_group_regs`.
51#[inline(always)]
52#[doc(hidden)]
53#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
54pub fn check_no_overlap_asymmetric<Reg, Memory, PC, CustomError>(
55    program_counter: &PC,
56    a: VReg,
57    a_count: NonZeroU8,
58    b: VReg,
59    b_count: NonZeroU8,
60) -> Result<(), ExecutionError<Reg::Type, CustomError>>
61where
62    Reg: Register,
63    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
64{
65    let a_start = u16::from(a.to_bits());
66    let b_start = u16::from(b.to_bits());
67    let a_count = u16::from(a_count.get());
68    let b_count = u16::from(b_count.get());
69    // Intervals [a_start, a_start+a_count) and [b_start, b_start+b_count) overlap iff
70    // each starts before the other ends.
71    if a_start < b_start + b_count && b_start < a_start + a_count {
72        cold_path();
73        return Err(ExecutionError::IllegalInstruction {
74            address: program_counter.old_pc(INSTRUCTION_SIZE),
75        });
76    }
77    Ok(())
78}
79
80/// Read element 0 of register `base_reg` as `u64`, zero-extended.
81///
82/// # Safety
83/// `sew.bytes() <= VLEN.bytes()`
84#[inline(always)]
85#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
86pub unsafe fn read_element_0_u64<const VLEN: Vlen>(
87    vregs: &VectorRegisterFile<VLEN>,
88    base_reg: VReg,
89    sew: Vsew,
90) -> u64 {
91    let sew_bytes = usize::from(sew.bytes_width());
92    let reg = vregs.get(base_reg);
93    let mut buf = [0u8; 8];
94    // SAFETY: `sew_bytes <= VLEN.bytes()` for all legal vtype; `sew_bytes <= 8`
95    unsafe {
96        buf.get_unchecked_mut(..sew_bytes)
97            .copy_from_slice(reg.get_unchecked(..sew_bytes));
98    }
99    u64::from_le_bytes(buf)
100}
101
102/// Write element 0 of register `base_reg` from the low `sew_bytes` of `value`.
103///
104/// # Safety
105/// `sew.bytes() <= VLEN.bytes()`
106#[inline(always)]
107#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
108pub unsafe fn write_element_0_u64<const VLEN: Vlen>(
109    vregs: &mut VectorRegisterFile<VLEN>,
110    base_reg: VReg,
111    sew: Vsew,
112    value: u64,
113) {
114    let sew_bytes = usize::from(sew.bytes_width());
115    let buf = value.to_le_bytes();
116    let reg = vregs.get_mut(base_reg);
117    // SAFETY: `sew_bytes <= VLEN.bytes()`; `sew_bytes <= 8`
118    unsafe {
119        reg.get_unchecked_mut(..sew_bytes)
120            .copy_from_slice(buf.get_unchecked(..sew_bytes));
121    }
122}
123
124/// Sign-extend the low `sew.bits_width()` of `val` to the register type width.
125///
126/// The arithmetic is performed entirely in 64-bit signed integer space: we shift the SEW-wide
127/// value left to place its sign bit at bit 63, then arithmetic-right-shift back to propagate it.
128/// The resulting `u64` is then narrowed to `Reg::Type` (32 or 64 bits) by combining via
129/// `From<u32>` - the only integer conversion in the `Register::Type` trait bounds.
130///
131/// For RV32 (`Reg::XLEN == 32`) the low 32 bits are already the correct sign-extended result
132/// because the arithmetic shift propagates the sign across all 64 bits and then we discard the
133/// upper half.
134///
135/// For RV64 (`Reg::XLEN == 64`) we must preserve all 64 bits. Since `Reg::Type: From<u32>` and
136/// `Reg::Type: Shl<u8>`, we reconstruct the 64-bit value by OR-ing two 32-bit halves shifted
137/// into position.
138#[inline(always)]
139#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
140pub fn sign_extend_to_reg<Reg>(val: u64, sew: Vsew) -> Reg::Type
141where
142    Reg: Register,
143{
144    let sew_bits = u32::from(sew.bits_width());
145    // `shift` is in [0, 64). When sew_bits == 64, shift == 0 and the value is unchanged.
146    let shift = u64::BITS - sew_bits;
147    // Cast to i64 so the right-shift is arithmetic (sign-extending).
148    let sign_extended = (val.cast_signed() << shift) >> shift;
149    let raw = sign_extended.cast_unsigned();
150    if Reg::XLEN == u64::BITS as u8 {
151        // RV64: preserve all 64 bits by splitting into two u32 halves.
152        let lo = Reg::Type::from(raw as u32);
153        let hi = Reg::Type::from((raw >> u32::BITS) as u32);
154        lo | (hi << 32u8)
155    } else {
156        // RV32: the low 32 bits are the correctly truncated result.
157        Reg::Type::from(raw as u32)
158    }
159}
160
161/// Execute a vslideup operation.
162///
163/// Elements `vstart..min(offset, vl)` in vd are unchanged.
164/// Elements `max(vstart, offset)..vl` where mask is active get vs2[i - offset].
165///
166/// # Safety
167/// - `vd` and `vs2` are validly aligned and non-overlapping (verified by caller).
168/// - `vl <= group_regs * VLEN.bytes() / sew_bytes`.
169/// - When `vm=false`: `vd.to_bits() != 0`.
170#[inline(always)]
171#[doc(hidden)]
172// TODO: #[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
173pub unsafe fn execute_slideup<Reg, ExtState, CustomError>(
174    ext_state: &mut ExtState,
175    vd: VReg,
176    vs2: VReg,
177    vm: bool,
178    sew: Vsew,
179    offset: u64,
180) where
181    Reg: Register,
182    ExtState: VectorRegistersExt<Reg, CustomError>,
183    [(); SUPPORTED_ELEN_VLEN::<{ ExtState::ELEN }, { ExtState::VLEN }>]:,
184    CustomError: fmt::Debug,
185{
186    let vl = ext_state.vl();
187    let vstart = ext_state.vstart();
188    // SAFETY: `vl <= VLEN`
189    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
190    // Per spec §16.3.1: elements 0..offset are never written (vd keeps its value).
191    // The active range starts at max(vstart, offset).
192    for i in vstart
193        .max(Vstart::from(offset.saturating_truncate::<u16>()))
194        .range_to(vl)
195    {
196        if !mask_bit(&mask_buf, i) {
197            continue;
198        }
199        let src_idx = i - offset.saturating_truncate::<u16>();
200        // SAFETY: src_idx < vl <= group_regs * elems_per_reg, so source element is in range
201        let val = unsafe { read_element_u64(ext_state.read_vregs(), vs2, src_idx, sew) };
202        // SAFETY: i < vl <= group_regs * elems_per_reg, so dest element is in range
203        unsafe {
204            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
205        }
206    }
207    ext_state.mark_vs_dirty();
208    ext_state.reset_vstart();
209}
210
211/// Execute a vslidedown operation.
212///
213/// Element `vd[i] = vs2[i + offset]` if `i + offset < vlmax`, else `0`.
214///
215/// # Safety
216/// - `vd` and `vs2` are validly aligned (verified by caller); overlap is permitted.
217/// - `vl <= vlmax`.
218/// - When `vm=false`: `vd.to_bits() != 0`.
219#[inline(always)]
220#[doc(hidden)]
221// TODO: #[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
222pub unsafe fn execute_slidedown<Reg, ExtState, CustomError>(
223    ext_state: &mut ExtState,
224    vd: VReg,
225    vs2: VReg,
226    vm: bool,
227    sew: Vsew,
228    vlmax: Vl,
229    offset: u64,
230) where
231    Reg: Register,
232    ExtState: VectorRegistersExt<Reg, CustomError>,
233    [(); SUPPORTED_ELEN_VLEN::<{ ExtState::ELEN }, { ExtState::VLEN }>]:,
234    CustomError: fmt::Debug,
235{
236    let vl = ext_state.vl();
237    let vstart = ext_state.vstart();
238    // SAFETY: `vl <= VLEN`
239    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
240    for i in vstart.range_to(vl) {
241        if !mask_bit(&mask_buf, i) {
242            continue;
243        }
244        // Use checked_add to guard against offset being so large that i + offset overflows u64.
245        // Any value that wraps past u64::MAX is trivially >= vlmax, so the spec requires vd[i]=0.
246        let val = if let Some(src_idx) = u64::from(i).checked_add(offset)
247            && src_idx < u64::from(vlmax)
248        {
249            // SAFETY: src_idx < vlmax <= group_regs * elems_per_reg, so element is in range
250            unsafe { read_element_u64(ext_state.read_vregs(), vs2, src_idx as u16, sew) }
251        } else {
252            0
253        };
254        // SAFETY: i < vl <= vlmax <= group_regs * elems_per_reg
255        unsafe {
256            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
257        }
258    }
259    ext_state.mark_vs_dirty();
260    ext_state.reset_vstart();
261}
262
263/// Execute a vslide1up operation.
264///
265/// Element 0 of vd gets `scalar` (when active and vl > 0).
266/// Element `i` for `1 <= i < vl` gets `vs2[i - 1]`.
267/// vd must not overlap vs2.
268///
269/// # Safety
270/// - `vd` and `vs2` are validly aligned and non-overlapping (verified by caller).
271/// - `vl <= group_regs * VLEN.bytes() / sew_bytes`.
272/// - When `vm=false`: `vd.to_bits() != 0`.
273#[inline(always)]
274#[doc(hidden)]
275// TODO: #[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
276pub unsafe fn execute_slide1up<Reg, ExtState, CustomError>(
277    ext_state: &mut ExtState,
278    vd: VReg,
279    vs2: VReg,
280    vm: bool,
281    sew: Vsew,
282    scalar: u64,
283) where
284    Reg: Register,
285    ExtState: VectorRegistersExt<Reg, CustomError>,
286    [(); SUPPORTED_ELEN_VLEN::<{ ExtState::ELEN }, { ExtState::VLEN }>]:,
287    CustomError: fmt::Debug,
288{
289    let vl = ext_state.vl();
290    let vstart = ext_state.vstart();
291    // SAFETY: `vl <= VLEN`
292    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
293    for i in vstart.range_to(vl) {
294        if !mask_bit(&mask_buf, i) {
295            continue;
296        }
297        let val = if i == 0 {
298            scalar
299        } else {
300            // SAFETY: i - 1 < vl <= group_regs * elems_per_reg
301            unsafe { read_element_u64(ext_state.read_vregs(), vs2, i - 1, sew) }
302        };
303        // SAFETY: i < vl <= group_regs * elems_per_reg
304        unsafe {
305            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
306        }
307    }
308    ext_state.mark_vs_dirty();
309    ext_state.reset_vstart();
310}
311
312/// Execute a vslide1down operation.
313///
314/// Element `vd[i] = vs2[i + 1]` for `i < vl - 1`; element `vd[vl - 1]` gets `scalar`.
315///
316/// Overlap between `vd` and `vs2` is permitted by the spec. When they share the same register
317/// group base (exact overlap), ascending iteration is still correct: each write goes to byte range
318/// `[i*sew, (i+1)*sew)` while the subsequent read comes from `[(i+1)*sew, (i+2)*sew)`. These
319/// ranges are adjacent and non-overlapping, so writing element `i` never corrupts the source bytes
320/// of element `i+1`.
321///
322/// # Safety
323/// - `vd` and `vs2` are validly aligned (verified by caller); overlap is permitted.
324/// - `vl <= group_regs * VLEN.bytes() / sew_bytes`.
325/// - When `vm=false`: `vd.to_bits() != 0`.
326#[inline(always)]
327#[doc(hidden)]
328// TODO: #[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
329pub unsafe fn execute_slide1down<Reg, ExtState, CustomError>(
330    ext_state: &mut ExtState,
331    vd: VReg,
332    vs2: VReg,
333    vm: bool,
334    sew: Vsew,
335    scalar: u64,
336) where
337    Reg: Register,
338    ExtState: VectorRegistersExt<Reg, CustomError>,
339    [(); SUPPORTED_ELEN_VLEN::<{ ExtState::ELEN }, { ExtState::VLEN }>]:,
340    CustomError: fmt::Debug,
341{
342    let vl = ext_state.vl();
343    let vstart = ext_state.vstart();
344    // SAFETY: `vl <= VLEN`
345    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
346    let range = vstart.range_to(vl);
347    for i in range.clone() {
348        if !mask_bit(&mask_buf, i) {
349            continue;
350        }
351        let val = if i < *range.end() {
352            // SAFETY: i + 1 < vl <= group_regs * elems_per_reg
353            unsafe { read_element_u64(ext_state.read_vregs(), vs2, i + 1, sew) }
354        } else {
355            scalar
356        };
357        // SAFETY: i < vl <= group_regs * elems_per_reg
358        unsafe {
359            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
360        }
361    }
362    ext_state.mark_vs_dirty();
363    ext_state.reset_vstart();
364}
365
366/// Execute vrgather.vv: `vd[i] = (vs1[i] < vlmax) ? vs2[vs1[i]] : 0`.
367///
368/// # Safety
369/// - `vd`, `vs2`, and `vs1` are validly aligned and mutually non-overlapping (verified by caller).
370/// - `vl <= vlmax`.
371/// - When `vm=false`: `vd.to_bits() != 0`.
372#[inline(always)]
373#[doc(hidden)]
374// TODO: #[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
375pub unsafe fn execute_rgather_vv<Reg, ExtState, CustomError>(
376    ext_state: &mut ExtState,
377    vd: VReg,
378    vs2: VReg,
379    vs1: VReg,
380    vm: bool,
381    sew: Vsew,
382    vlmax: Vl,
383) where
384    Reg: Register,
385    ExtState: VectorRegistersExt<Reg, CustomError>,
386    [(); SUPPORTED_ELEN_VLEN::<{ ExtState::ELEN }, { ExtState::VLEN }>]:,
387    CustomError: fmt::Debug,
388{
389    let vl = ext_state.vl();
390    let vstart = ext_state.vstart();
391    // SAFETY: `vl <= VLEN`
392    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
393    for i in vstart.range_to(vl) {
394        if !mask_bit(&mask_buf, i) {
395            continue;
396        }
397        // SAFETY: i < vl <= group_regs * elems_per_reg for vs1
398        let index = unsafe { read_element_u64(ext_state.read_vregs(), vs1, i, sew) };
399        let val = if index < u64::from(vlmax) {
400            // SAFETY: index < vlmax <= group_regs * elems_per_reg for vs2
401            unsafe { read_element_u64(ext_state.read_vregs(), vs2, index as u16, sew) }
402        } else {
403            0u64
404        };
405        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
406        unsafe {
407            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
408        }
409    }
410    ext_state.mark_vs_dirty();
411    ext_state.reset_vstart();
412}
413
414/// Execute vrgather.vx / vrgather.vi: all active elements get `vs2[index]` or `0`.
415///
416/// # Safety
417/// - `vd` and `vs2` are validly aligned and non-overlapping (verified by caller).
418/// - `vl <= vlmax`.
419/// - When `vm=false`: `vd.to_bits() != 0`.
420#[inline(always)]
421#[doc(hidden)]
422// TODO: #[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
423pub unsafe fn execute_rgather_scalar<Reg, ExtState, CustomError>(
424    ext_state: &mut ExtState,
425    vd: VReg,
426    vs2: VReg,
427    vm: bool,
428    sew: Vsew,
429    vlmax: Vl,
430    index: u64,
431) where
432    Reg: Register,
433    ExtState: VectorRegistersExt<Reg, CustomError>,
434    [(); SUPPORTED_ELEN_VLEN::<{ ExtState::ELEN }, { ExtState::VLEN }>]:,
435    CustomError: fmt::Debug,
436{
437    let vl = ext_state.vl();
438    let vstart = ext_state.vstart();
439    // SAFETY: `vl <= VLEN`
440    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
441    // Pre-compute the gathered value; it's the same for all elements.
442    let val = if index < u64::from(vlmax) {
443        // SAFETY: index < vlmax <= group_regs * elems_per_reg for vs2
444        unsafe { read_element_u64(ext_state.read_vregs(), vs2, index as u16, sew) }
445    } else {
446        0u64
447    };
448    for i in vstart.range_to(vl) {
449        if !mask_bit(&mask_buf, i) {
450            continue;
451        }
452        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
453        unsafe {
454            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
455        }
456    }
457    ext_state.mark_vs_dirty();
458    ext_state.reset_vstart();
459}
460
461/// Execute vrgatherei16.vv: `vd[i] = (vs1_16[i] < vlmax) ? vs2[vs1_16[i]] : 0`.
462///
463/// `vs1` always uses EEW=16 regardless of SEW. `vl` must not exceed the index register group
464/// capacity, i.e. `vl <= index_group_regs * VLEN.bytes() / 2` (VLEN.bytes() / 2 = elems per
465/// register at EEW=16).
466///
467/// # Safety
468/// - `vd`, `vs2`, and `vs1` are validly aligned and mutually non-overlapping (verified by caller).
469/// - `vl <= vlmax` (for the data register group) AND `vl <= index_group_regs * VLEN.bytes() / 2`
470///   (for the index register group).
471/// - When `vm=false`: `vd.to_bits() != 0`.
472#[inline(always)]
473#[expect(clippy::too_many_arguments, reason = "Internal API")]
474#[doc(hidden)]
475// TODO: #[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
476pub unsafe fn execute_rgatherei16<Reg, ExtState, CustomError>(
477    ext_state: &mut ExtState,
478    vd: VReg,
479    vs2: VReg,
480    vs1: VReg,
481    vm: bool,
482    sew: Vsew,
483    vlmax: Vl,
484    index_group_regs: NonZeroU8,
485) where
486    Reg: Register,
487    ExtState: VectorRegistersExt<Reg, CustomError>,
488    [(); SUPPORTED_ELEN_VLEN::<{ ExtState::ELEN }, { ExtState::VLEN }>]:,
489    CustomError: fmt::Debug,
490{
491    let index_group_regs = index_group_regs.get();
492    let vl = ext_state.vl();
493    let vstart = ext_state.vstart();
494    // Maximum number of EEW=16 elements the index register group can hold.
495    // Each register holds VLEN.bytes() / 2 elements at EEW=16.
496    let index_capacity = u32::from(index_group_regs) * (ExtState::VLEN.bytes() / 2);
497    // `vl` must not exceed either the data VLMAX or the index register group capacity.
498    // Both bounds are guaranteed by the caller; this debug assertion catches misuse early.
499    debug_assert!(
500        vl <= vlmax && u32::from(vl) <= index_capacity,
501        "vl={vl} exceeds vlmax={vlmax} or index_capacity={index_capacity}"
502    );
503    // SAFETY: `vl <= VLEN`
504    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
505    for i in vstart.range_to(vl) {
506        if !mask_bit(&mask_buf, i) {
507            continue;
508        }
509        // Read 16-bit index from vs1; EEW=16 always.
510        // SAFETY: i < vl <= index_capacity = index_group_regs * (VLEN.bytes() / 2), so element i
511        // fits within the index register group.
512        let index = unsafe { read_element_u64(ext_state.read_vregs(), vs1, i, Vsew::E16) };
513        let val = if index < u64::from(vlmax) {
514            // SAFETY: index < vlmax <= group_regs * elems_per_reg for vs2
515            unsafe { read_element_u64(ext_state.read_vregs(), vs2, index as u16, sew) }
516        } else {
517            0u64
518        };
519        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
520        unsafe {
521            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
522        }
523    }
524    ext_state.mark_vs_dirty();
525    ext_state.reset_vstart();
526}
527
528/// Execute vmerge.vvm / vmv.v.v.
529///
530/// When `vm=true` (vmv.v.v): all active elements `vstart..vl` get `vs1[i]`; vs2 unused.
531/// When `vm=false` (vmerge.vvm): active elements where `v0[i]=1` get `vs1[i]`,
532/// inactive elements get `vs2[i]`.
533///
534/// # Safety
535/// - `vd` and `vs1` are validly aligned (verified by caller).
536/// - When `vm=false`: `vs2` is validly aligned and `vd` does not overlap v0 (verified by caller).
537/// - `vl <= group_regs * VLEN.bytes() / sew_bytes`.
538#[inline(always)]
539#[doc(hidden)]
540// TODO: #[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
541pub unsafe fn execute_merge_vv<Reg, ExtState, CustomError>(
542    ext_state: &mut ExtState,
543    vd: VReg,
544    vs2: VReg,
545    vs1: VReg,
546    vm: bool,
547    sew: Vsew,
548) where
549    Reg: Register,
550    ExtState: VectorRegistersExt<Reg, CustomError>,
551    [(); SUPPORTED_ELEN_VLEN::<{ ExtState::ELEN }, { ExtState::VLEN }>]:,
552    CustomError: fmt::Debug,
553{
554    let vl = ext_state.vl();
555    let vstart = ext_state.vstart();
556    // SAFETY: `vl <= VLEN`
557    // For vmv.v.v (vm=true) the mask is all-ones so snapshot_mask is still valid.
558    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
559    for i in vstart.range_to(vl) {
560        let mask_set = mask_bit(&mask_buf, i);
561        let val = if mask_set {
562            // SAFETY: i < vl <= group_regs * elems_per_reg for vs1
563            unsafe { read_element_u64(ext_state.read_vregs(), vs1, i, sew) }
564        } else {
565            // mask_set=false only reachable when vm=false (vmerge path).
566            // SAFETY: i < vl <= group_regs * elems_per_reg for vs2
567            unsafe { read_element_u64(ext_state.read_vregs(), vs2, i, sew) }
568        };
569        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
570        unsafe {
571            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
572        }
573    }
574    ext_state.mark_vs_dirty();
575    ext_state.reset_vstart();
576}
577
578/// Execute vmerge.vxm / vmerge.vim / vmv.v.x / vmv.v.i.
579///
580/// When `vm=true`: all active elements `vstart..vl` get `scalar`; vs2 unused.
581/// When `vm=false`: active elements where `v0[i]=1` get `scalar`,
582/// inactive elements get `vs2[i]`.
583///
584/// # Safety
585/// - `vd` is validly aligned (verified by caller).
586/// - When `vm=false`: `vs2` is validly aligned and `vd` does not overlap v0 (verified by caller).
587/// - `vl <= group_regs * VLEN.bytes() / sew_bytes`.
588#[inline(always)]
589#[doc(hidden)]
590// TODO: #[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
591pub unsafe fn execute_merge_scalar<Reg, ExtState, CustomError>(
592    ext_state: &mut ExtState,
593    vd: VReg,
594    vs2: VReg,
595    vm: bool,
596    sew: Vsew,
597    scalar: u64,
598) where
599    Reg: Register,
600    ExtState: VectorRegistersExt<Reg, CustomError>,
601    [(); SUPPORTED_ELEN_VLEN::<{ ExtState::ELEN }, { ExtState::VLEN }>]:,
602    CustomError: fmt::Debug,
603{
604    let vl = ext_state.vl();
605    let vstart = ext_state.vstart();
606    // SAFETY: `vl <= VLEN`
607    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
608
609    for i in vstart.range_to(vl) {
610        let val = if mask_bit(&mask_buf, i) {
611            scalar
612        } else {
613            // SAFETY: i < vl <= group_regs * elems_per_reg for vs2
614            unsafe { read_element_u64(ext_state.read_vregs(), vs2, i, sew) }
615        };
616        // SAFETY: i < vl <= group_regs * elems_per_reg for vd
617        unsafe {
618            write_element_u64(ext_state.write_vregs(), vd, i, sew, val);
619        }
620    }
621    ext_state.mark_vs_dirty();
622    ext_state.reset_vstart();
623}
624
625/// Execute vcompress.vm: pack active elements of vs2 (under vs1 mask) sequentially into vd.
626///
627/// `vs1` is treated as an explicit mask register (single register, not LMUL-grouped).
628/// The output write index increments only for elements where `vs1[i]` is set.
629/// vd must not overlap vs1 or vs2.
630///
631/// # Safety
632/// - `vd`, `vs2` are validly aligned and non-overlapping (verified by caller).
633/// - `vs1` does not overlap `vd` (verified by caller).
634/// - `vl <= VLMAX`.
635#[inline(always)]
636#[doc(hidden)]
637// TODO: #[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
638pub unsafe fn execute_compress<Reg, ExtState, CustomError>(
639    ext_state: &mut ExtState,
640    vd: VReg,
641    vs2: VReg,
642    vs1: VReg,
643    vl: Vl,
644    sew: Vsew,
645) where
646    Reg: Register,
647    ExtState: VectorRegistersExt<Reg, CustomError>,
648    [(); SUPPORTED_ELEN_VLEN::<{ ExtState::ELEN }, { ExtState::VLEN }>]:,
649    CustomError: fmt::Debug,
650{
651    let mask_bytes = usize::from(vl.bytes());
652    let vreg = ext_state.read_vregs();
653    let mut vs1_buf = [0u8; VLENB_USIZE::<{ ExtState::VLEN }>];
654    // SAFETY: mask_bytes <= VLEN.bytes() since vl <= VLEN; vs1_base < 32
655    unsafe {
656        vs1_buf
657            .get_unchecked_mut(..mask_bytes)
658            .copy_from_slice(vreg.get(vs1).get_unchecked(..mask_bytes));
659    }
660    let mut out_idx = 0;
661    for i in Vstart::ZERO.range_to(vl) {
662        if !mask_bit(&vs1_buf, i) {
663            continue;
664        }
665        // SAFETY: i < vl <= group_regs * elems_per_reg
666        let val = unsafe { read_element_u64(ext_state.read_vregs(), vs2, i, sew) };
667        // SAFETY: out_idx <= popcount(vs1[0..vl)) <= vl
668        unsafe {
669            write_element_u64(ext_state.write_vregs(), vd, out_idx, sew, val);
670        }
671        out_idx += 1;
672    }
673    ext_state.mark_vs_dirty();
674    ext_state.reset_vstart();
675}
676
677/// Copy `COUNT` whole vector registers from `src_base` to `dst_base`.
678///
679/// No masking, no vtype dependency. Uses snapshot semantics: all source registers are read into
680/// a stack buffer before any destination registers are written, giving correct memmove-style
681/// behaviour for all overlap patterns (including partial overlap such as src=V0, dst=V1, count=2).
682///
683/// # Safety
684/// - `dst_base + COUNT <= 32` and `src_base + COUNT <= 32` (verified by caller via alignment
685///   checks).
686/// - `dst_base % COUNT == 0` and `src_base % COUNT == 0` (verified by caller).
687#[inline(always)]
688#[doc(hidden)]
689#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
690pub unsafe fn execute_whole_reg_move<const COUNT: usize, const VLEN: Vlen>(
691    vregs: &mut VectorRegisterFile<VLEN>,
692    dst_base: VReg,
693    src_base: VReg,
694) {
695    // Snapshot all source registers before writing any destination registers.
696    // This is correct for all overlap patterns without direction-dependent logic.
697    let mut tmp = [[0u8; _]; COUNT];
698    for (k, item) in tmp.iter_mut().enumerate() {
699        // SAFETY: Guaranteed by function contract
700        let src = unsafe { VReg::from_bits(src_base.to_bits() + k as u8).unwrap_unchecked() };
701        *item = *vregs.get(src);
702    }
703    for (k, item) in tmp.iter().enumerate() {
704        // SAFETY: Guaranteed by function contract
705        let dst = unsafe { VReg::from_bits(dst_base.to_bits() + k as u8).unwrap_unchecked() };
706        *vregs.get_mut(dst) = *item;
707    }
708}
ab_riscv_interpreter/v/zvexx/perm/zvexx_perm_helpers.rs

ab_riscv_interpreter/v/zvexx/perm/
zvexx_perm_helpers.rs