ab_riscv_interpreter/v/zvexx/fixed_point/
zvexx_fixed_point_helpers.rs

1//! Opaque helpers for ZveXx extension
2
3use crate::v::vector_registers::{VectorRegisterFile, VectorRegistersExt};
4pub use crate::v::zvexx::arith::zvexx_arith_helpers::{
5    OpSrc, check_vreg_group_alignment, sew_mask,
6};
7use crate::v::zvexx::arith::zvexx_arith_helpers::{
8    read_element_u64, sign_extend, write_element_u64,
9};
10use crate::v::zvexx::load::zvexx_load_helpers::{mask_bit, snapshot_mask};
11use crate::v::zvexx::zvexx_helpers::INSTRUCTION_SIZE;
12use crate::{ExecutionError, ProgramCounter};
13use ab_riscv_primitives::prelude::*;
14use core::fmt;
15use core::hint::cold_path;
16
17/// Compute the rounding increment for a right shift of `val` by `shift` bits.
18///
19/// When `shift == 0` there are no fractional bits so the increment is always zero.
20/// `current_result_lsb` is the LSB of the truncated result, required for `Rne` and `Rod`.
21#[inline(always)]
22#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
23fn round_increment(val: u64, shift: u32, mode: Vxrm, current_result_lsb: u64) -> u64 {
24    if shift == 0 {
25        return 0;
26    }
27    // `d_minus1_bit`: the most-significant discarded bit (bit position `shift - 1`)
28    let d_minus1_bit = (val >> (shift - 1)) & 1;
29    // `sticky`: OR of all bits below position `shift - 1`
30    let sticky = if shift >= 2 {
31        // Any of bits [shift-2 : 0] set?
32        (val & ((1u64 << (shift - 1)).wrapping_sub(1))) != 0
33    } else {
34        false
35    };
36    match mode {
37        // Round nearest up: increment = v[d-1]
38        Vxrm::Rnu => d_minus1_bit,
39        // Round nearest even: increment = v[d-1] & (sticky | result_lsb)
40        Vxrm::Rne => d_minus1_bit & u64::from(sticky || current_result_lsb != 0),
41        // Round down / truncate: never increment
42        Vxrm::Rdn => 0,
43        // Round to odd: set result LSB if any discarded bit was non-zero
44        Vxrm::Rod => u64::from(current_result_lsb == 0 && (d_minus1_bit != 0 || sticky)),
45    }
46}
47
48/// Perform a rounded right shift of `val` by `shift` bits (logical / unsigned).
49///
50/// Returns `(val >> shift) + round_increment`.
51#[inline(always)]
52#[doc(hidden)]
53#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
54pub fn rounded_srl(val: u64, shift: u32, mode: Vxrm) -> u64 {
55    let truncated = val >> shift;
56    let r = round_increment(val, shift, mode, truncated & 1);
57    truncated.wrapping_add(r)
58}
59
60/// Perform a rounded arithmetic right shift of `val` (sign-extended to SEW) by `shift` bits.
61///
62/// Returns the SEW-wide signed result as `u64` (sign bits above SEW are meaningful).
63#[inline(always)]
64#[doc(hidden)]
65#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
66pub fn rounded_sra(val: u64, shift: u32, mode: Vxrm, sew: Vsew) -> u64 {
67    let signed = sign_extend(val, sew);
68    // Treat the raw bits for rounding purposes: rounding uses the unsigned representation of the
69    // SEW-wide value (only bits below `shift` matter, so masking is not needed here since the
70    // discarded bits are the same regardless of sign extension).
71    let truncated_signed = signed >> shift;
72    let r = round_increment(val, shift, mode, truncated_signed.cast_unsigned() & 1);
73    truncated_signed.cast_unsigned().wrapping_add(r)
74}
75
76/// Saturating unsigned add: `vs2 + src`, clamped to `[0, 2^SEW - 1]`.
77///
78/// Sets `vxsat` to `true` on overflow.
79#[inline(always)]
80#[doc(hidden)]
81#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
82pub fn sat_addu(a: u64, b: u64, sew: Vsew, vxsat: &mut bool) -> u64 {
83    let mask = sew_mask(sew);
84    let a_w = a & mask;
85    let b_w = b & mask;
86    let result = a_w.wrapping_add(b_w);
87    if result & mask < a_w {
88        // Overflow: wrapped around
89        *vxsat = true;
90        mask
91    } else {
92        result & mask
93    }
94}
95
96/// Saturating signed add: `vs2 + src`, clamped to `[-(2^(SEW-1)), 2^(SEW-1) - 1]`.
97///
98/// Sets `vxsat` to `true` on overflow.
99#[inline(always)]
100#[doc(hidden)]
101#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
102pub fn sat_add(a: u64, b: u64, sew: Vsew, vxsat: &mut bool) -> u64 {
103    let sa = i128::from(sign_extend(a, sew));
104    let sb = i128::from(sign_extend(b, sew));
105    let result = sa.wrapping_add(sb);
106    let min_val = i128::MIN >> (i128::BITS - u32::from(sew.bits_width()));
107    let max_val = i128::MAX >> (i128::BITS - u32::from(sew.bits_width()));
108    if result < min_val {
109        *vxsat = true;
110        (min_val as i64).cast_unsigned() & sew_mask(sew)
111    } else if result > max_val {
112        *vxsat = true;
113        (max_val as i64).cast_unsigned() & sew_mask(sew)
114    } else {
115        (result as i64).cast_unsigned() & sew_mask(sew)
116    }
117}
118
119/// Saturating unsigned subtract: `vs2 - src`, clamped to `[0, 2^SEW - 1]`.
120///
121/// Sets `vxsat` to `true` on overflow (underflow to negative).
122#[inline(always)]
123#[doc(hidden)]
124#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
125pub fn sat_subu(a: u64, b: u64, sew: Vsew, vxsat: &mut bool) -> u64 {
126    let mask = sew_mask(sew);
127    let a_w = a & mask;
128    let b_w = b & mask;
129    if a_w < b_w {
130        *vxsat = true;
131        0
132    } else {
133        (a_w - b_w) & mask
134    }
135}
136
137/// Saturating signed subtract: `vs2 - src`, clamped to `[-(2^(SEW-1)), 2^(SEW-1) - 1]`.
138///
139/// Sets `vxsat` to `true` on overflow.
140#[inline(always)]
141#[doc(hidden)]
142#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
143pub fn sat_sub(a: u64, b: u64, sew: Vsew, vxsat: &mut bool) -> u64 {
144    let sa = i128::from(sign_extend(a, sew));
145    let sb = i128::from(sign_extend(b, sew));
146    let result = sa.wrapping_sub(sb);
147    let min_val = i128::MIN >> (i128::BITS - u32::from(sew.bits_width()));
148    let max_val = i128::MAX >> (i128::BITS - u32::from(sew.bits_width()));
149    if result < min_val {
150        *vxsat = true;
151        (min_val as i64).cast_unsigned() & sew_mask(sew)
152    } else if result > max_val {
153        *vxsat = true;
154        (max_val as i64).cast_unsigned() & sew_mask(sew)
155    } else {
156        (result as i64).cast_unsigned() & sew_mask(sew)
157    }
158}
159
160/// Averaging unsigned add: `(vs2 + src) >> 1` with rounding per `vxrm`.
161///
162/// Uses a 1-bit wider intermediate to avoid overflow; no saturation, no `vxsat`.
163#[inline(always)]
164#[doc(hidden)]
165#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
166pub fn avg_addu(a: u64, b: u64, sew: Vsew, mode: Vxrm) -> u64 {
167    let mask = sew_mask(sew);
168    let a_w = a & mask;
169    let b_w = b & mask;
170    // Compute full sum in one extra bit by using u128 or by widening trick.
171    // Since SEW <= 64 and both operands are SEW-bit values, the sum fits in SEW+1 bits.
172    // Use wrapping_add: the carry out of bit SEW-1 is the extra bit.
173    let sum = a_w.wrapping_add(b_w);
174    // Carry: set if unsigned sum overflowed SEW bits
175    let carry = u64::from(sum & mask < a_w);
176    // Full (SEW+1)-bit value: `carry` is at bit position SEW, `sum & mask` are low SEW bits.
177    // We need `(carry:sum) >> 1` with rounding.
178    // Bit 0 of `sum & mask` is the rounding bit for the truncated division.
179    let r = round_increment(sum & mask, 1, mode, (sum >> 1u8) & 1);
180    // Shift the (SEW+1)-bit quantity right by 1: result = (carry << (SEW-1)) | ((sum & mask) >> 1)
181    let shifted = (carry << (u32::from(sew.bits_width()) - 1)) | ((sum & mask) >> 1u8);
182    (shifted.wrapping_add(r)) & mask
183}
184
185/// Averaging signed add: `(vs2 + src) >> 1` with rounding per `vxrm`.
186///
187/// No saturation, no `vxsat`.
188#[inline(always)]
189#[doc(hidden)]
190#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
191pub fn avg_add(a: u64, b: u64, sew: Vsew, mode: Vxrm) -> u64 {
192    let sa = sign_extend(a, sew);
193    let sb = sign_extend(b, sew);
194    // Full sum as i128 to avoid overflow
195    let sum = i128::from(sa).wrapping_add(i128::from(sb));
196    // The low bit is the fractional bit for rounding
197    let r = match mode {
198        Vxrm::Rnu => (sum & 1).cast_unsigned() as u64,
199        Vxrm::Rne => {
200            // round-to-nearest-even: increment if fractional bit set AND (result LSB or sticky)
201            // For a single bit shift there are no lower sticky bits, so only check result LSB
202            let result_lsb = ((sum >> 1u8) & 1).cast_unsigned() as u64;
203            ((sum & 1).cast_unsigned() as u64) & result_lsb
204        }
205        Vxrm::Rdn => 0,
206        Vxrm::Rod => {
207            // Set result LSB if it would be 0 and the fractional bit is nonzero
208            let result_lsb = (sum >> 1u8) & 1;
209            u64::from(result_lsb == 0 && (sum & 1) != 0)
210        }
211    };
212    let result = (sum >> 1u8) + i128::from(r);
213    (result as i64).cast_unsigned() & sew_mask(sew)
214}
215
216/// Averaging unsigned subtract: `(vs2 - src) >> 1` with rounding per `vxrm`.
217///
218/// No saturation, no `vxsat`.
219#[inline(always)]
220#[doc(hidden)]
221#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
222pub fn avg_subu(a: u64, b: u64, sew: Vsew, mode: Vxrm) -> u64 {
223    let mask = sew_mask(sew);
224    let a_w = a & mask;
225    let b_w = b & mask;
226    // Compute difference with borrow using wrapping sub; borrow extends to SEW+1 bit.
227    let diff = a_w.wrapping_sub(b_w);
228    // Borrow: set if a < b (unsigned)
229    let borrow = u64::from(a_w < b_w);
230    // Full (SEW+1)-bit two's-complement difference:
231    // If borrow: the SEW-bit `diff` is correct (it wrapped), and the sign extension bit is 1.
232    // Rounding: bit 0 of diff is the fractional bit.
233    let r = round_increment(diff & mask, 1, mode, (diff >> 1u8) & 1);
234    // Arithmetic right shift by 1 of the (SEW+1)-bit signed value.
235    // For unsigned averaging subtract: result = ((SEW+1)-bit diff) / 2 with rounding.
236    // The (SEW+1)-bit value is: borrow is the sign bit. If borrow set, value is negative.
237    // Result = (borrow << SEW | diff) >> 1 (arithmetic) + r
238    // Arithmetic shift: sign bit (`borrow`) propagates.
239    let sign_fill = borrow.wrapping_neg(); // all ones if borrow set, zero otherwise
240    let shifted = (sign_fill << (u32::from(sew.bits_width()) - 1)) | ((diff & mask) >> 1u8);
241    (shifted.wrapping_add(r)) & mask
242}
243
244/// Averaging signed subtract: `(vs2 - src) >> 1` with rounding per `vxrm`.
245///
246/// No saturation, no `vxsat`.
247#[inline(always)]
248#[doc(hidden)]
249#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
250pub fn avg_sub(a: u64, b: u64, sew: Vsew, mode: Vxrm) -> u64 {
251    let sa = sign_extend(a, sew);
252    let sb = sign_extend(b, sew);
253    let diff = i128::from(sa).wrapping_sub(i128::from(sb));
254    let r = match mode {
255        Vxrm::Rnu => (diff & 1).cast_unsigned() as u64,
256        Vxrm::Rne => {
257            let result_lsb = ((diff >> 1u8) & 1).cast_unsigned() as u64;
258            ((diff & 1).cast_unsigned() as u64) & result_lsb
259        }
260        Vxrm::Rdn => 0,
261        Vxrm::Rod => {
262            let result_lsb = (diff >> 1u8) & 1;
263            u64::from(result_lsb == 0 && (diff & 1) != 0)
264        }
265    };
266    let result = (diff >> 1u8) + i128::from(r);
267    (result as i64).cast_unsigned() & sew_mask(sew)
268}
269
270/// Fractional multiply with rounding and saturation: `vsmul`.
271///
272/// Computes `(a * b * 2 + rounding) >> SEW`, saturating at the signed maximum when the
273/// product of two minimum signed values overflows (`INT_MIN * INT_MIN`).
274///
275/// Per spec §12.4: `vd[i] = clip(roundoff_signed(vs2[i] * vs1[i] * 2, SEW))`.
276/// Sets `vxsat` on overflow.
277#[inline(always)]
278#[doc(hidden)]
279#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
280pub fn smul(a: u64, b: u64, sew: Vsew, mode: Vxrm, vxsat: &mut bool) -> u64 {
281    // SEW-wide signed min and max in i64 (valid for all SEW <= 64)
282    let min_sew = i64::MIN >> (i64::BITS - u32::from(sew.bits_width()));
283    let max_sew = i64::MAX >> (i64::BITS - u32::from(sew.bits_width()));
284    let sa = i128::from(sign_extend(a, sew));
285    let sb = i128::from(sign_extend(b, sew));
286    // The only case where `product * 2` overflows a 2*SEW signed result is INT_MIN * INT_MIN.
287    // Detect this before any multiply: for SEW=64 INT64_MIN^2 = 2^126 and <<1 would overflow i128.
288    if sa == i128::from(min_sew) && sb == i128::from(min_sew) {
289        cold_path();
290        *vxsat = true;
291        return max_sew.cast_unsigned() & sew_mask(sew);
292    }
293    // Full 2*SEW-bit product; no overflow possible because at least one operand != INT_MIN,
294    // so |product| < INT_MIN^2 and the value fits in i128 for SEW <= 64.
295    let product = sa * sb;
296    // Left shift by 1 for the Q-format fractional interpretation; safe because
297    // |product| < INT_MIN^2, so after <<1 the result still fits in i128 for SEW <= 64.
298    let doubled = product << 1u8;
299    // Extract the low SEW bits (the discarded portion) for rounding.
300    // Cast to u128 first to avoid sign-extension contaminating the mask.
301    let shift = u32::from(sew.bits_width());
302    let low_bits = (doubled.cast_unsigned() & u128::from(sew_mask(sew))) as u64;
303    // Arithmetic right shift by SEW gives the truncated signed result in SEW-wide range.
304    let truncated = doubled >> shift;
305    let r = round_increment(
306        low_bits,
307        shift.min(64),
308        mode,
309        (truncated.cast_unsigned() as u64) & 1,
310    );
311    // `truncated` fits in i64 after the SEW-bit shift (it is a SEW-wide signed value).
312    let result = (truncated as i64).wrapping_add(r.cast_signed());
313    // Clamp to SEW-wide signed range (only reachable if rounding pushed the value over)
314    if result < min_sew {
315        *vxsat = true;
316        min_sew.cast_unsigned() & sew_mask(sew)
317    } else if result > max_sew {
318        *vxsat = true;
319        max_sew.cast_unsigned() & sew_mask(sew)
320    } else {
321        result.cast_unsigned() & sew_mask(sew)
322    }
323}
324
325/// Narrowing unsigned clip: read a 2*SEW element from `vs2`, shift right by `shamt` with
326/// rounding, saturate to unsigned SEW range, set `vxsat` on clamp.
327///
328/// `vs2_elem` is the 2*SEW-bit element (zero-extended to u64 for SEW <= 32;
329/// for SEW = 64 the doubled width would be 128 bits, but Zve64x only supports SEW up to 64 and
330/// the narrowing destination is at most 64 bits wide, so 2*SEW = 128 - however the spec requires
331/// `ELEN >= 2*SEW` for narrowing instructions. Since `ELEN = 64` in Zve64x, narrowing is only
332/// valid for SEW <= 32 (`2*SEW <= 64`).  The caller must enforce this constraint by checking
333/// `vsew` before invoking narrowing operations.
334///
335/// `vs2_elem` is passed as `u64`; for SEW = 32 it holds a 64-bit (2*SEW) value.
336#[inline(always)]
337#[doc(hidden)]
338#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
339pub fn nclipu(vs2_elem: u64, shamt: u32, sew: Vsew, mode: Vxrm, vxsat: &mut bool) -> u64 {
340    // Shift right with rounding
341    let shifted = rounded_srl(vs2_elem, shamt, mode);
342    // Saturate to destination SEW unsigned range [0, 2^SEW - 1]
343    let max_dst = sew_mask(sew);
344    if shifted > max_dst {
345        *vxsat = true;
346        max_dst
347    } else {
348        shifted & max_dst
349    }
350}
351
352/// Narrowing signed clip: read a 2*SEW signed element from `vs2`, shift right arithmetically
353/// with rounding, saturate to signed SEW range.
354///
355/// Same SEW constraint as [`nclipu`].
356#[inline(always)]
357#[doc(hidden)]
358#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
359pub fn nclip(vs2_elem: u64, shamt: u32, sew: Vsew, mode: Vxrm, vxsat: &mut bool) -> u64 {
360    // Sign-extend vs2_elem to full i64 treating it as a 2*SEW-bit signed value.
361    // For SEW=8 the source is 16-bit, for SEW=16 it is 32-bit, for SEW=32 it is 64-bit.
362    // TODO: Use `sew.double_width()`
363    let double_sew_bits = sew.bits_width() * 2;
364    let shift_amt = i64::BITS - u32::from(double_sew_bits);
365    let signed_wide = (vs2_elem.cast_signed() << shift_amt) >> shift_amt;
366    // Arithmetic right shift with rounding
367    // For rounding we need the raw low bits of the wide value before shifting
368    let low_bits = signed_wide.cast_unsigned()
369        & if double_sew_bits == 64 {
370            u64::MAX
371        } else {
372            (1u64 << double_sew_bits) - 1
373        };
374    let truncated = signed_wide >> shamt;
375    let r = round_increment(low_bits, shamt, mode, (truncated.cast_unsigned()) & 1);
376    let rounded = truncated.wrapping_add(r.cast_signed());
377    // Saturate to signed SEW range
378    let min_dst = i64::MIN >> (i64::BITS - u32::from(sew.bits_width()));
379    let max_dst = i64::MAX >> (i64::BITS - u32::from(sew.bits_width()));
380    if rounded < min_dst {
381        *vxsat = true;
382        min_dst.cast_unsigned() & sew_mask(sew)
383    } else if rounded > max_dst {
384        *vxsat = true;
385        max_dst.cast_unsigned() & sew_mask(sew)
386    } else {
387        rounded.cast_unsigned() & sew_mask(sew)
388    }
389}
390
391/// Read a 2*SEW-wide element as `u64` from the double-width source register group of a narrowing
392/// instruction.
393///
394/// For narrowing instructions `vs2` holds elements of width `2*SEW`. The register group size is
395/// `2 * group_regs`. Element `i` of width `2*SEW` is located in the same way as a SEW-wide
396/// element of width `2*SEW` (i.e., treating `2*SEW` as the element width). For `SEW = 32` this
397/// reads 64-bit elements; for `SEW <= 16` it reads narrower elements but zero-extends to `u64`.
398///
399/// # Safety
400/// - `2*SEW <= 64` (Zve64x constraint: only valid for SEW <= 32; caller must verify)
401/// - `base_reg + elem_i / (VLEN.bytes() / (2*sew_bytes)) < 32`
402#[inline(always)]
403#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
404pub unsafe fn read_wide_element_u64<const VLEN: Vlen>(
405    vregs: &VectorRegisterFile<VLEN>,
406    base_reg: VReg,
407    elem_i: u16,
408    sew: Vsew,
409) -> u64 {
410    let double_sew_bytes = u32::from(sew.bytes_width()) * 2;
411    let elems_per_reg = VLEN.bytes() / double_sew_bytes;
412    let reg_off = u32::from(elem_i) / elems_per_reg;
413    let byte_off = (u32::from(elem_i) % elems_per_reg) * double_sew_bytes;
414    // SAFETY: caller guarantees bounds
415    let reg = unsafe {
416        vregs.get(VReg::from_bits(base_reg.to_bits() + reg_off as u8).unwrap_unchecked())
417    };
418    // SAFETY: `byte_off + double_sew_bytes <= VLEN.bytes()`
419    let src =
420        unsafe { reg.get_unchecked(byte_off as usize..(byte_off + double_sew_bytes) as usize) };
421    let mut buf = [0u8; 8];
422    // SAFETY: `double_sew_bytes <= 8` (SEW <= 32 for Zve64x narrowing)
423    unsafe { buf.get_unchecked_mut(..double_sew_bytes as usize) }.copy_from_slice(src);
424    u64::from_le_bytes(buf)
425}
426
427/// Execute a single-width fixed-point arithmetic operation that may set `vxsat`.
428///
429/// `op` receives `(vs2_elem, src_elem, sew, vxrm)` and returns `(result, saturated)`.
430/// The helper ORs any saturation flag into `vxsat` after the loop.
431///
432/// # Safety
433/// Same preconditions as `execute_arith_op` in the arithmetic helpers.
434#[inline(always)]
435#[doc(hidden)]
436// TODO: #[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
437pub unsafe fn execute_fixed_point_op<Reg, ExtState, CustomError, F>(
438    ext_state: &mut ExtState,
439    vd: VReg,
440    vs2: VReg,
441    src: OpSrc,
442    vm: bool,
443    sew: Vsew,
444    op: F,
445) where
446    Reg: Register,
447    ExtState: VectorRegistersExt<Reg, CustomError>,
448    [(); SUPPORTED_ELEN_VLEN::<{ ExtState::ELEN }, { ExtState::VLEN }>]:,
449    CustomError: fmt::Debug,
450    // op: (vs2_elem, src_elem, sew, vxrm) -> result
451    F: Fn(u64, u64, Vsew, Vxrm, &mut bool) -> u64,
452{
453    let vl = ext_state.vl();
454    let vstart = ext_state.vstart();
455    let vxrm = ext_state.vxrm();
456    // SAFETY: `vl <= VLEN`
457    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
458    let mut any_sat = false;
459    for i in vstart.range_to(vl) {
460        if !mask_bit(&mask_buf, i) {
461            continue;
462        }
463        // SAFETY: alignment and bounds checked by caller
464        let a = unsafe { read_element_u64(ext_state.read_vregs(), vs2, i, sew) };
465        let b = match src {
466            OpSrc::Vreg(vs1_base) => {
467                // SAFETY: same argument as vs2
468                unsafe { read_element_u64(ext_state.read_vregs(), vs1_base, i, sew) }
469            }
470            OpSrc::Scalar(val) => val,
471        };
472        let result = op(a, b, sew, vxrm, &mut any_sat);
473        // SAFETY: alignment and bounds checked by caller
474        unsafe {
475            write_element_u64(ext_state.write_vregs(), vd, i, sew, result);
476        }
477    }
478    if any_sat {
479        // vxsat is sticky: OR in the new saturation flag
480        ext_state.set_vxsat(true);
481    }
482    ext_state.mark_vs_dirty();
483    ext_state.reset_vstart();
484}
485
486/// Execute a narrowing fixed-point clip operation.
487///
488/// `vs2` holds a double-width register group (2x `group_regs` registers). `vd` holds the
489/// single-width destination. `src` provides the shift amount (Vreg or Scalar).
490///
491/// For Zve64x narrowing instructions, `SEW` must be at most 32 because `2*SEW` must fit in 64
492/// bits. The caller must verify this constraint before invoking this function.
493///
494/// # Safety
495/// - `sew.bits_width() <= 32` (Zve64x ELEN = 64 constraint for narrowing)
496/// - `vs2.to_bits() % (2 * group_regs) == 0` and `vs2.to_bits() + 2 * group_regs <= 32`
497/// - `vd.to_bits() % group_regs == 0` and `vd.to_bits() + group_regs <= 32`
498/// - `vl <= group_regs * VLEN.bytes() / sew_bytes`
499/// - When `vm=false`: `vd.to_bits() != 0`
500#[inline(always)]
501#[doc(hidden)]
502// TODO: #[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
503pub unsafe fn execute_narrowing_clip_op<Reg, ExtState, CustomError, F>(
504    ext_state: &mut ExtState,
505    vd: VReg,
506    vs2: VReg,
507    src: OpSrc,
508    vm: bool,
509    sew: Vsew,
510    op: F,
511) where
512    Reg: Register,
513    ExtState: VectorRegistersExt<Reg, CustomError>,
514    [(); SUPPORTED_ELEN_VLEN::<{ ExtState::ELEN }, { ExtState::VLEN }>]:,
515    CustomError: fmt::Debug,
516    // op: (vs2_wide_elem, shamt, sew, vxrm, vxsat) -> result
517    F: Fn(u64, u32, Vsew, Vxrm, &mut bool) -> u64,
518{
519    let vl = ext_state.vl();
520    let vstart = ext_state.vstart();
521    let vxrm = ext_state.vxrm();
522    // SAFETY: `vl <= VLEN`
523    let mask_buf = unsafe { snapshot_mask(ext_state.read_vregs(), vm, vl) };
524    let mut any_sat = false;
525    // Mask shift amount to log2(2*SEW) bits per spec §12.11
526    let shamt_mask = u64::from(sew.bits_width() * 2 - 1);
527    for i in vstart.range_to(vl) {
528        if !mask_bit(&mask_buf, i) {
529            continue;
530        }
531        // Read 2*SEW-wide source element
532        // SAFETY: `vs2` double-width alignment checked by caller
533        let wide_a = unsafe { read_wide_element_u64(ext_state.read_vregs(), vs2, i, sew) };
534        let shamt = match src {
535            OpSrc::Vreg(vs1_base) => {
536                // SAFETY: vs1 SEW-wide alignment checked by caller
537                let raw = unsafe { read_element_u64(ext_state.read_vregs(), vs1_base, i, sew) };
538                (raw & shamt_mask) as u32
539            }
540            OpSrc::Scalar(val) => (val & shamt_mask) as u32,
541        };
542        let result = op(wide_a, shamt, sew, vxrm, &mut any_sat);
543        // SAFETY: `vd` alignment checked by caller
544        unsafe {
545            write_element_u64(ext_state.write_vregs(), vd, i, sew, result);
546        }
547    }
548    if any_sat {
549        ext_state.set_vxsat(true);
550    }
551    ext_state.mark_vs_dirty();
552    ext_state.reset_vstart();
553}
554
555/// Verify that the destination SEW is valid for narrowing (must be at most 32 in Zve64x).
556///
557/// Returns `Err(IllegalInstruction)` when `sew.bits_width() > 32`.
558#[inline(always)]
559#[doc(hidden)]
560#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
561pub fn check_narrowing_sew<Reg, Memory, PC, CustomError>(
562    program_counter: &PC,
563    sew: Vsew,
564) -> Result<(), ExecutionError<Reg::Type, CustomError>>
565where
566    Reg: Register,
567    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
568{
569    if sew.bits_width() > 32 {
570        cold_path();
571        return Err(ExecutionError::IllegalInstruction {
572            address: program_counter.old_pc(INSTRUCTION_SIZE),
573        });
574    }
575    Ok(())
576}
577
578/// Check that the double-width source `vs2` of a narrowing instruction is aligned to its register
579/// group and fits in `[0, 32)`.
580///
581/// The source operand has `EEW = 2*SEW`, so its `EMUL = 2*LMUL`. Per v-spec §5.2 the group must be
582/// aligned to `EMUL` registers, and `EMUL` outside the legal range `[1/8, 8]` (e.g. `LMUL=8`, which
583/// would need `EMUL=16`) is reserved. Unlike `2 * register_count()`, this correctly yields a single
584/// register with no alignment constraint for fractional `LMUL` (where `2*LMUL <= 1`).
585///
586/// `sew` is the destination (narrow) SEW; it must be at most 32 (see [`check_narrowing_sew()`]).
587#[inline(always)]
588#[doc(hidden)]
589#[cfg_attr(feature = "no-panic", no_panic_const::no_panic)]
590pub fn check_vs2_narrowing_alignment<Reg, Memory, PC, CustomError>(
591    program_counter: &PC,
592    vs2: VReg,
593    vlmul: Vlmul,
594    sew: Vsew,
595) -> Result<(), ExecutionError<Reg::Type, CustomError>>
596where
597    Reg: Register,
598    PC: ProgramCounter<Reg::Type, Memory, CustomError>,
599{
600    // Source EEW is double the destination SEW. SEW=64 is rejected earlier by
601    // `check_narrowing_sew`.
602    let wide_eew = match sew {
603        Vsew::E8 => Eew::E16,
604        Vsew::E16 => Eew::E32,
605        Vsew::E32 => Eew::E64,
606        Vsew::E64 => {
607            cold_path();
608            return Err(ExecutionError::IllegalInstruction {
609                address: program_counter.old_pc(INSTRUCTION_SIZE),
610            });
611        }
612    };
613    // `EMUL = 2*LMUL`; `None` when reserved (e.g. LMUL=8 -> EMUL=16).
614    let Some(wide_group) = vlmul.data_register_count(wide_eew, sew) else {
615        cold_path();
616        return Err(ExecutionError::IllegalInstruction {
617            address: program_counter.old_pc(INSTRUCTION_SIZE),
618        });
619    };
620    let wide_group = wide_group.get();
621    let vs2_idx = vs2.to_bits();
622    if !vs2_idx.is_multiple_of(wide_group) || vs2_idx + wide_group > 32 {
623        cold_path();
624        return Err(ExecutionError::IllegalInstruction {
625            address: program_counter.old_pc(INSTRUCTION_SIZE),
626        });
627    }
628    Ok(())
629}
ab_riscv_interpreter/v/zvexx/fixed_point/zvexx_fixed_point_helpers.rs

ab_riscv_interpreter/v/zvexx/fixed_point/
zvexx_fixed_point_helpers.rs