Skip to main content

zinc_poly/univariate/
binary_u64.rs

1use crate::{
2    ConstCoeffBitWidth, EvaluatablePolynomial, EvaluationError, Polynomial,
3    univariate::{dense::DensePolynomial, prepare_projection},
4};
5use core::mem::MaybeUninit;
6use crypto_primitives::{PrimeField, Semiring, semiring::boolean::Boolean};
7use derive_more::{AsRef, Display};
8use num_traits::{CheckedAdd, CheckedMul, CheckedSub, One, Zero};
9use rand::{distr::StandardUniform, prelude::*};
10use std::{
11    array,
12    hash::Hash,
13    iter::{Product, Sum},
14    marker::PhantomData,
15    ops::{Add, AddAssign, Mul, MulAssign, Sub, SubAssign},
16};
17use zinc_transcript::delegate_const_transcribable;
18use zinc_utils::{
19    from_ref::FromRef,
20    inner_product::{InnerProduct, InnerProductError},
21    mul_by_scalar::MulByScalar,
22    named::Named,
23    projectable_to_field::ProjectableToField,
24};
25
26#[derive(AsRef, Clone, Debug, Default, Display, Hash, PartialEq, Eq)]
27#[repr(transparent)]
28pub struct BinaryU64Poly<const DEGREE_PLUS_ONE: usize>(u64); // we can fit up to degree 64, which is ok for now
29
30impl<const DEGREE_PLUS_ONE: usize> BinaryU64Poly<DEGREE_PLUS_ONE> {
31    const _DEGREE_CHECK: () = {
32        // Note that associated constants on generic types are evaluated lazily, so we
33        // have to "access" this to trigger the assertion.
34        assert!(
35            DEGREE_PLUS_ONE <= 64,
36            "For BinaryU64Poly, degree cannot exceed 63"
37        );
38    };
39
40    #[inline(always)]
41    #[allow(clippy::let_unit_value)]
42    pub const fn inner(&self) -> &u64 {
43        let _ = Self::_DEGREE_CHECK; // Force lazy evaluation of constraint
44        &self.0
45    }
46}
47
48impl<const DEGREE_PLUS_ONE: usize> From<BinaryU64Poly<DEGREE_PLUS_ONE>> for u64 {
49    #[inline(always)]
50    fn from(binary_poly: BinaryU64Poly<DEGREE_PLUS_ONE>) -> Self {
51        binary_poly.0
52    }
53}
54
55impl<const DEGREE_PLUS_ONE: usize> From<BinaryU64Poly<DEGREE_PLUS_ONE>>
56    for DensePolynomial<Boolean, DEGREE_PLUS_ONE>
57{
58    #[inline(always)]
59    fn from(binary_poly: BinaryU64Poly<DEGREE_PLUS_ONE>) -> Self {
60        DensePolynomial {
61            coeffs: array::from_fn(|i| Boolean::new(!(binary_poly.0 & (1 << i)).is_zero())),
62        }
63    }
64}
65
66impl<const DEGREE_PLUS_ONE: usize> From<u32> for BinaryU64Poly<DEGREE_PLUS_ONE> {
67    #[inline(always)]
68    fn from(value: u32) -> Self {
69        // Keep masking enforced in a single place
70        Self::from(u64::from(value))
71    }
72}
73
74impl<const DEGREE_PLUS_ONE: usize> From<u64> for BinaryU64Poly<DEGREE_PLUS_ONE> {
75    #[inline(always)]
76    #[allow(clippy::arithmetic_side_effects, clippy::let_unit_value)]
77    fn from(value: u64) -> Self {
78        let _ = Self::_DEGREE_CHECK; // Force lazy evaluation of constraint
79
80        // Bit `i` of `value` becomes the coefficient of `X^i`.
81        if DEGREE_PLUS_ONE == 64 {
82            Self(value)
83        } else {
84            // Bits at positions `>= DEGREE_PLUS_ONE` are masked off so that no bits beyond
85            // the polynomial degree are set. This is important, as downstream
86            // code (such as `BinaryU64PolyInnerProduct::inner_product`) relies on it.
87            Self(value & ((1_u64 << DEGREE_PLUS_ONE) - 1))
88        }
89    }
90}
91
92impl<const DEGREE_PLUS_ONE: usize> BinaryU64Poly<DEGREE_PLUS_ONE> {
93    /// Create a new polynomial with the given coefficients.
94    /// If the input has fewer than N+1 coefficients, the remaining slots will
95    /// be filled with zeros. If the input has more than N+1 coefficients,
96    /// it will panic.
97    #[inline(always)]
98    pub fn new(coeffs: impl AsRef<[Boolean]>) -> Self {
99        // Self(DensePolynomial::new(coeffs))
100        let coeffs = coeffs.as_ref();
101        assert!(coeffs.len() <= DEGREE_PLUS_ONE);
102        let mut value: u64 = 0;
103        for (i, coeff) in coeffs.iter().enumerate() {
104            if coeff.inner() {
105                value |= 1 << i;
106            }
107        }
108        Self(value)
109    }
110
111    /// Create a new polynomial with the given coefficients.
112    /// If the input has fewer than N+1 coefficients, the remaining slots will
113    /// be filled with zeros. If the input has more than N+1 coefficients,
114    /// it will panic.
115    #[inline(always)]
116    pub fn new_padded(coeffs: impl AsRef<[Boolean]>) -> Self {
117        let coeffs = coeffs.as_ref();
118        assert!(coeffs.len() <= DEGREE_PLUS_ONE);
119        let mut value: u64 = 0;
120        for (i, coeff) in coeffs.iter().enumerate() {
121            if coeff.inner() {
122                value |= 1 << i;
123            }
124        }
125        Self(value)
126    }
127}
128
129impl<const DEGREE_PLUS_ONE: usize> Zero for BinaryU64Poly<DEGREE_PLUS_ONE> {
130    #[inline(always)]
131    fn zero() -> Self {
132        Self(0)
133    }
134
135    #[inline(always)]
136    fn is_zero(&self) -> bool {
137        self.0.is_zero()
138    }
139}
140
141impl<const DEGREE_PLUS_ONE: usize> One for BinaryU64Poly<DEGREE_PLUS_ONE> {
142    #[inline(always)]
143    fn one() -> Self {
144        Self(1)
145    }
146}
147
148impl<'a, const DEGREE_PLUS_ONE: usize> Add<&'a Self> for BinaryU64Poly<DEGREE_PLUS_ONE> {
149    type Output = Self;
150
151    #[allow(clippy::arithmetic_side_effects, clippy::suspicious_arithmetic_impl)]
152    #[inline(always)]
153    fn add(self, rhs: &'a Self) -> Self::Output {
154        // addition in GF(2) is XOR
155        Self(self.0 ^ rhs.0)
156    }
157}
158
159impl<const DEGREE_PLUS_ONE: usize> Add for BinaryU64Poly<DEGREE_PLUS_ONE> {
160    type Output = Self;
161
162    #[allow(clippy::arithmetic_side_effects)]
163    #[inline(always)]
164    fn add(self, rhs: Self) -> Self::Output {
165        self.add(&rhs)
166    }
167}
168
169impl<'a, const DEGREE_PLUS_ONE: usize> Sub<&'a Self> for BinaryU64Poly<DEGREE_PLUS_ONE> {
170    type Output = Self;
171
172    #[allow(clippy::arithmetic_side_effects, clippy::suspicious_arithmetic_impl)]
173    #[inline(always)]
174    fn sub(self, rhs: &'a Self) -> Self::Output {
175        // subtraction in GF(2) is XOR
176        Self(self.0 ^ rhs.0)
177    }
178}
179
180impl<const DEGREE_PLUS_ONE: usize> Sub for BinaryU64Poly<DEGREE_PLUS_ONE> {
181    type Output = Self;
182
183    #[allow(clippy::arithmetic_side_effects)]
184    #[inline(always)]
185    fn sub(self, rhs: Self) -> Self::Output {
186        self.sub(&rhs)
187    }
188}
189
190impl<const DEGREE_PLUS_ONE: usize> Mul for BinaryU64Poly<DEGREE_PLUS_ONE> {
191    type Output = Self;
192
193    #[allow(clippy::arithmetic_side_effects)]
194    #[inline(always)]
195    fn mul(self, rhs: Self) -> Self::Output {
196        self.mul(&rhs)
197    }
198}
199
200impl<'a, const DEGREE_PLUS_ONE: usize> Mul<&'a Self> for BinaryU64Poly<DEGREE_PLUS_ONE> {
201    type Output = Self;
202
203    #[allow(clippy::arithmetic_side_effects)]
204    #[inline(always)]
205    fn mul(self, _rhs: &'a Self) -> Self::Output {
206        unimplemented!("Multiplication for BinaryU64Poly is not implemented yet");
207    }
208}
209
210#[allow(clippy::suspicious_op_assign_impl)]
211impl<'a, const DEGREE_PLUS_ONE: usize> AddAssign<&'a Self> for BinaryU64Poly<DEGREE_PLUS_ONE> {
212    #[inline(always)]
213    fn add_assign(&mut self, rhs: &'a Self) {
214        // addition in GF(2) is XOR
215        self.0 ^= rhs.0;
216    }
217}
218
219impl<const DEGREE_PLUS_ONE: usize> AddAssign for BinaryU64Poly<DEGREE_PLUS_ONE> {
220    #[inline(always)]
221    fn add_assign(&mut self, rhs: Self) {
222        self.add_assign(&rhs);
223    }
224}
225
226#[allow(clippy::suspicious_op_assign_impl)]
227impl<'a, const DEGREE_PLUS_ONE: usize> SubAssign<&'a Self> for BinaryU64Poly<DEGREE_PLUS_ONE> {
228    #[inline(always)]
229    fn sub_assign(&mut self, rhs: &'a Self) {
230        // subtraction in GF(2) is XOR
231        self.0 ^= rhs.0;
232    }
233}
234
235impl<const DEGREE_PLUS_ONE: usize> SubAssign for BinaryU64Poly<DEGREE_PLUS_ONE> {
236    #[inline(always)]
237    fn sub_assign(&mut self, rhs: Self) {
238        self.sub_assign(&rhs);
239    }
240}
241
242impl<const DEGREE_PLUS_ONE: usize> MulAssign for BinaryU64Poly<DEGREE_PLUS_ONE> {
243    #[inline(always)]
244    fn mul_assign(&mut self, _rhs: Self) {
245        unimplemented!("Multiplication for BinaryU64Poly is not implemented yet");
246    }
247}
248
249impl<'a, const DEGREE_PLUS_ONE: usize> MulAssign<&'a Self> for BinaryU64Poly<DEGREE_PLUS_ONE> {
250    #[inline(always)]
251    fn mul_assign(&mut self, _rhs: &'a Self) {
252        unimplemented!("Multiplication for BinaryU64Poly is not implemented yet");
253        // self.0.mul_assign(&rhs.0);
254    }
255}
256
257impl<const DEGREE_PLUS_ONE: usize> CheckedAdd for BinaryU64Poly<DEGREE_PLUS_ONE> {
258    #[inline(always)]
259    fn checked_add(&self, other: &Self) -> Option<Self> {
260        // If any bit is set in both operands, addition would overflow (1+1)
261        if (self.0 & other.0) != 0 {
262            None
263        } else {
264            // addition in GF(2) is XOR
265            Some(Self(self.0 ^ other.0))
266        }
267    }
268}
269
270impl<const DEGREE_PLUS_ONE: usize> CheckedSub for BinaryU64Poly<DEGREE_PLUS_ONE> {
271    #[inline(always)]
272    fn checked_sub(&self, other: &Self) -> Option<Self> {
273        // If any bit is 0 in self and 1 in other, subtraction would overflow (0-1)
274        if ((!self.0) & other.0) != 0 {
275            // There exists a bit where self is 0 and other is 1
276            None
277        } else {
278            // subtraction in GF(2) is XOR
279            Some(Self(self.0 ^ other.0))
280        }
281    }
282}
283
284impl<const DEGREE_PLUS_ONE: usize> CheckedMul for BinaryU64Poly<DEGREE_PLUS_ONE> {
285    #[inline(always)]
286    fn checked_mul(&self, _other: &Self) -> Option<Self> {
287        unimplemented!("Checked multiplication for BinaryU64Poly is not implemented yet");
288    }
289}
290
291impl<'a, const DEGREE_PLUS_ONE: usize> Sum<&'a Self> for BinaryU64Poly<DEGREE_PLUS_ONE> {
292    #[inline(always)]
293    fn sum<I: Iterator<Item = &'a Self>>(iter: I) -> Self {
294        iter.sum()
295    }
296}
297
298impl<'a, const DEGREE_PLUS_ONE: usize> Product<&'a Self> for BinaryU64Poly<DEGREE_PLUS_ONE> {
299    #[inline(always)]
300    fn product<I: Iterator<Item = &'a Self>>(iter: I) -> Self {
301        iter.product()
302    }
303}
304
305impl<const DEGREE_PLUS_ONE: usize> Semiring for BinaryU64Poly<DEGREE_PLUS_ONE> {}
306
307impl<const DEGREE_PLUS_ONE: usize> Distribution<BinaryU64Poly<DEGREE_PLUS_ONE>>
308    for StandardUniform
309{
310    #[inline(always)]
311    #[allow(clippy::arithmetic_side_effects)]
312    fn sample<Gen: Rng + ?Sized>(&self, rng: &mut Gen) -> BinaryU64Poly<DEGREE_PLUS_ONE> {
313        let mut value: u64 = rng.next_u64();
314        if DEGREE_PLUS_ONE < 64 {
315            value &= (1u64 << DEGREE_PLUS_ONE) - 1;
316        }
317        BinaryU64Poly::<DEGREE_PLUS_ONE>(value)
318    }
319}
320
321//
322// Zip-specific traits
323//
324impl<const DEGREE_PLUS_ONE: usize> Polynomial<Boolean> for BinaryU64Poly<DEGREE_PLUS_ONE> {
325    const DEGREE_BOUND: usize = DensePolynomial::<Boolean, DEGREE_PLUS_ONE>::DEGREE_BOUND;
326}
327
328#[allow(clippy::arithmetic_side_effects)]
329impl<R: Clone + Zero + One + CheckedAdd + CheckedMul, const DEGREE_PLUS_ONE: usize>
330    EvaluatablePolynomial<Boolean, R> for BinaryU64Poly<DEGREE_PLUS_ONE>
331{
332    type EvaluationPoint = R;
333
334    fn evaluate_at_point(&self, point: &R) -> Result<R, EvaluationError> {
335        if DEGREE_PLUS_ONE.is_one() {
336            return Ok(R::zero());
337        }
338
339        let mut result = R::zero();
340        let mut pow = R::one();
341        for i in 0..DEGREE_PLUS_ONE {
342            if (self.0 & (1 << i)) != 0 {
343                result = result.checked_add(&pow).ok_or(EvaluationError::Overflow)?;
344            }
345            if i + 1 < DEGREE_PLUS_ONE {
346                pow = pow.checked_mul(point).ok_or(EvaluationError::Overflow)?;
347            }
348        }
349
350        Ok(result)
351    }
352}
353
354impl<const DEGREE_PLUS_ONE: usize> ConstCoeffBitWidth for BinaryU64Poly<DEGREE_PLUS_ONE> {
355    const COEFF_BIT_WIDTH: usize = DensePolynomial::<Boolean, DEGREE_PLUS_ONE>::COEFF_BIT_WIDTH;
356}
357
358impl<const DEGREE_PLUS_ONE: usize> Named for BinaryU64Poly<DEGREE_PLUS_ONE> {
359    fn type_name() -> String {
360        format!("BPoly<{}>", Self::DEGREE_BOUND)
361    }
362}
363
364delegate_const_transcribable!(BinaryU64Poly<const DEGREE_PLUS_ONE: usize>(u64));
365
366impl<const DEGREE_PLUS_ONE: usize> FromRef<BinaryU64Poly<DEGREE_PLUS_ONE>>
367    for BinaryU64Poly<DEGREE_PLUS_ONE>
368{
369    #[inline(always)]
370    fn from_ref(poly: &BinaryU64Poly<DEGREE_PLUS_ONE>) -> Self {
371        poly.clone()
372    }
373}
374
375impl<const DEGREE_PLUS_ONE: usize> From<&BinaryU64Poly<DEGREE_PLUS_ONE>>
376    for BinaryU64Poly<DEGREE_PLUS_ONE>
377{
378    #[inline(always)]
379    fn from(value: &BinaryU64Poly<DEGREE_PLUS_ONE>) -> Self {
380        Self::from_ref(value)
381    }
382}
383
384pub struct BinaryU64PolyIter<'a, const DEGREE_PLUS_ONE: usize> {
385    i: usize,
386    poly: &'a BinaryU64Poly<DEGREE_PLUS_ONE>,
387}
388
389impl<'a, const DEGREE_PLUS_ONE: usize> BinaryU64PolyIter<'a, DEGREE_PLUS_ONE> {
390    pub fn new(poly: &'a BinaryU64Poly<DEGREE_PLUS_ONE>) -> Self {
391        Self { i: 0, poly }
392    }
393}
394
395impl<'a, const DEGREE_PLUS_ONE: usize> Iterator for BinaryU64PolyIter<'a, DEGREE_PLUS_ONE> {
396    type Item = Boolean;
397
398    #[allow(clippy::arithmetic_side_effects)]
399    fn next(&mut self) -> Option<Self::Item> {
400        if self.i < DEGREE_PLUS_ONE {
401            let i = self.i;
402
403            self.i += 1;
404
405            Some((!(self.poly.0 & (1 << i)).is_zero()).into())
406        } else {
407            None
408        }
409    }
410}
411
412impl<const DEGREE_PLUS_ONE: usize> BinaryU64Poly<DEGREE_PLUS_ONE> {
413    pub fn iter(&self) -> BinaryU64PolyIter<'_, DEGREE_PLUS_ONE> {
414        BinaryU64PolyIter::new(self)
415    }
416}
417
418#[derive(Clone, Debug)]
419pub struct BinaryU64PolyInnerProduct<R, const DEGREE_PLUS_ONE: usize>(PhantomData<R>);
420
421impl<Rhs, Out, const DEGREE_PLUS_ONE: usize> InnerProduct<BinaryU64Poly<DEGREE_PLUS_ONE>, Rhs, Out>
422    for BinaryU64PolyInnerProduct<Rhs, DEGREE_PLUS_ONE>
423where
424    Rhs: Clone,
425    Out: FromRef<Rhs> + CheckedAdd,
426{
427    #[inline(always)]
428    #[allow(clippy::arithmetic_side_effects)] // By design
429    fn inner_product<const CHECK: bool>(
430        lhs: &BinaryU64Poly<DEGREE_PLUS_ONE>,
431        rhs: &[Rhs],
432        zero: Out,
433    ) -> Result<Out, InnerProductError> {
434        if rhs.len() != DEGREE_PLUS_ONE {
435            return Err(InnerProductError::LengthMismatch {
436                lhs: DEGREE_PLUS_ONE,
437                rhs: rhs.len(),
438            });
439        }
440
441        let mut acc = zero;
442        let mut bits = lhs.0;
443        while bits != 0 {
444            let i = bits.trailing_zeros() as usize;
445            let rhs = Out::from_ref(&rhs[i]);
446            if CHECK {
447                acc = acc.checked_add(&rhs).ok_or(InnerProductError::Overflow)?;
448            } else {
449                acc = acc + rhs;
450            }
451            // changes the LSB 1 bit to 0
452            bits &= bits - 1;
453        }
454
455        Ok(acc)
456    }
457}
458
459impl<F, const DEGREE_PLUS_ONE: usize> ProjectableToField<F> for BinaryU64Poly<DEGREE_PLUS_ONE>
460where
461    F: PrimeField + FromRef<F> + 'static,
462{
463    fn prepare_projection(sampled_value: &F) -> impl Fn(&Self) -> F + 'static {
464        prepare_projection::<F, Self, _, DEGREE_PLUS_ONE>(sampled_value, |poly, i| {
465            (poly.0 & (1 << i)) != 0
466        })
467    }
468}
469
470impl<const DEGREE_PLUS_ONE: usize> MulByScalar<&i64, DensePolynomial<i64, DEGREE_PLUS_ONE>>
471    for BinaryU64Poly<DEGREE_PLUS_ONE>
472{
473    fn mul_by_scalar<const CHECK: bool>(
474        &self,
475        rhs: &i64,
476    ) -> Option<DensePolynomial<i64, DEGREE_PLUS_ONE>> {
477        Some(widen_simd::<DEGREE_PLUS_ONE>(self, *rhs))
478    }
479}
480
481#[allow(unreachable_code, unused_variables)] // CI system does not support SIMD features
482#[inline(always)]
483pub fn widen_simd<const DEGREE_PLUS_ONE: usize>(
484    poly: &BinaryU64Poly<DEGREE_PLUS_ONE>,
485    scalar: i64,
486) -> DensePolynomial<i64, DEGREE_PLUS_ONE> {
487    let mut coeffs_uninit = MaybeUninit::<[i64; DEGREE_PLUS_ONE]>::uninit();
488    let out_ptr = coeffs_uninit.as_mut_ptr() as *mut i64;
489
490    #[cfg(target_arch = "aarch64")]
491    unsafe {
492        widen_fill_neon::<DEGREE_PLUS_ONE>(&poly.0, out_ptr, scalar);
493    }
494    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
495    unsafe {
496        widen_fill_avx512::<DEGREE_PLUS_ONE>(&poly.0, out_ptr, scalar);
497    }
498    #[cfg(not(any(
499        target_arch = "aarch64",
500        all(target_arch = "x86_64", target_feature = "avx512f")
501    )))]
502    {
503        panic!("SIMD widening not supported on this architecture");
504    }
505
506    let coeffs = unsafe { coeffs_uninit.assume_init() };
507    DensePolynomial { coeffs }
508}
509
510#[allow(
511    clippy::arithmetic_side_effects,
512    clippy::cast_possible_truncation,
513    clippy::cast_lossless
514)]
515#[allow(unsafe_op_in_unsafe_fn)]
516#[cfg(target_arch = "aarch64")]
517#[inline(always)]
518// Converts a u64 bitmask into an array of i64 values using ARM NEON SIMD.
519// Processes 8 bits at a time with ~8-12× speedup vs scalar code.
520unsafe fn widen_fill_neon<const N: usize>(mask_ref: &u64, out_ptr: *mut i64, scalar: i64) {
521    use core::arch::aarch64::*;
522
523    let mask64: u64 = *mask_ref;
524
525    // Replicate scalar across SIMD lanes for branchless selection
526    let scalar_v: int64x2_t = vdupq_n_s64(scalar);
527
528    let mut i = 0usize;
529
530    // For N == 64, use lookup table (16KB). For smaller N, use SIMD widening
531    if N == 64 {
532        // Precomputed lookup table: maps each byte value (0-255) directly to 8 i64
533        // values Each bit in the byte produces -1 (if set) or 0 (if clear)
534        // Table size: 256 entries × 8 i64 × 8 bytes = 16KB
535        #[repr(align(64))]
536        struct LookupTable([[i64; 8]; 256]);
537
538        static LUT: LookupTable = {
539            let mut table = [[0i64; 8]; 256];
540            let mut i = 0;
541            while i < 256 {
542                let mut bit = 0;
543                while bit < 8 {
544                    table[i][bit] = if (i & (1 << bit)) != 0 { -1 } else { 0 };
545                    bit += 1;
546                }
547                i += 1;
548            }
549            LookupTable(table)
550        };
551
552        // Main loop: process 8 coefficients per iteration
553        while i + 8 <= N {
554            let shift = i as u32;
555            let byte: u8 = ((mask64 >> shift) & 0xFF) as u8;
556
557            // Direct table lookup: load 8 i64 values (4 int64x2_t vectors)
558            let masks_ptr = LUT.0[byte as usize].as_ptr();
559            let m0: int64x2_t = vld1q_s64(masks_ptr);
560            let m1: int64x2_t = vld1q_s64(masks_ptr.add(2));
561            let m2: int64x2_t = vld1q_s64(masks_ptr.add(4));
562            let m3: int64x2_t = vld1q_s64(masks_ptr.add(6));
563
564            // Branchless select: scalar & -1 = scalar, scalar & 0 = 0
565            vst1q_s64(out_ptr.add(i), vandq_s64(scalar_v, m0));
566            vst1q_s64(out_ptr.add(i + 2), vandq_s64(scalar_v, m1));
567            vst1q_s64(out_ptr.add(i + 4), vandq_s64(scalar_v, m2));
568            vst1q_s64(out_ptr.add(i + 6), vandq_s64(scalar_v, m3));
569
570            i += 8;
571        }
572    } else {
573        // Compile-time constant for bit masks [1,2,4,8,16,32,64,128]
574        const BIT_MASKS: [u8; 8] = [1, 2, 4, 8, 16, 32, 64, 128];
575        let bit_masks: uint8x8_t = vld1_u8(BIT_MASKS.as_ptr());
576        let zero_u8: uint8x8_t = vdup_n_u8(0);
577
578        // Main loop: process 8 coefficients per iteration
579        while i + 8 <= N {
580            let shift = i as u32;
581            let byte: u8 = ((mask64 >> shift) & 0xFF) as u8;
582
583            // Extract bits: replicate byte and AND with bit masks to isolate each bit
584            let vbyte: uint8x8_t = vdup_n_u8(byte);
585            let selected: uint8x8_t = vand_u8(vbyte, bit_masks);
586            let nz: uint8x8_t = vcgt_u8(selected, zero_u8); // 0xFF if bit set, 0x00 if not
587
588            // Sign-extend 0xFF → 0xFFFF...FFFF, 0x00 → 0x0000...0000 via signed widening
589            let nz_signed: int8x8_t = vreinterpret_s8_u8(nz); // Reinterpret: 0xFF = -1, 0x00 = 0
590            let s16: int16x8_t = vmovl_s8(nz_signed); // -1 → 0xFFFF, 0 → 0x0000
591            let lo32: int32x4_t = vmovl_s16(vget_low_s16(s16));
592            let hi32: int32x4_t = vmovl_s16(vget_high_s16(s16));
593
594            let m0: int64x2_t = vmovl_s32(vget_low_s32(lo32)); // -1 → 0xFFFF...FFFF
595            let m1: int64x2_t = vmovl_s32(vget_high_s32(lo32));
596            let m2: int64x2_t = vmovl_s32(vget_low_s32(hi32));
597            let m3: int64x2_t = vmovl_s32(vget_high_s32(hi32));
598
599            // Branchless select: scalar & 0xFFFF... = scalar, scalar & 0x0000... = 0
600            vst1q_s64(out_ptr.add(i), vandq_s64(scalar_v, m0));
601            vst1q_s64(out_ptr.add(i + 2), vandq_s64(scalar_v, m1));
602            vst1q_s64(out_ptr.add(i + 4), vandq_s64(scalar_v, m2));
603            vst1q_s64(out_ptr.add(i + 6), vandq_s64(scalar_v, m3));
604
605            i += 8;
606        }
607    }
608
609    // Tail: handle remaining coefficients one at a time
610    while i < N {
611        let bit = ((mask64 >> i) & 1) != 0;
612        let mask = -(bit as i64); // 0 or -1 (all bits set)
613        *out_ptr.add(i) = scalar & mask;
614        i += 1;
615    }
616}
617
618#[allow(
619    clippy::arithmetic_side_effects,
620    clippy::cast_possible_truncation,
621    clippy::cast_lossless
622)]
623#[allow(unsafe_op_in_unsafe_fn)]
624#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
625#[inline(always)]
626// Converts a u64 bitmask into an array of i64 values using AVX512 SIMD.
627// Processes 32 bits at a time with 4-way unrolling for instruction-level
628// parallelism. Significantly cleaner than NEON due to native mask register
629// support.
630unsafe fn widen_fill_avx512<const N: usize>(mask_ref: &u64, out_ptr: *mut i64, scalar: i64) {
631    #[cfg(target_arch = "x86_64")]
632    use core::arch::x86_64::*;
633
634    let mask64: u64 = *mask_ref;
635    let mut i = 0usize;
636
637    // Unrolled loop: process 32 coefficients (4 x 8) per iteration for ILP
638    // Interleave independent operations to keep multiple execution units busy
639    while i + 32 <= N {
640        let shift = i as u32;
641
642        // Extract all 4 bytes at once - no dependencies between extractions
643        let byte0: u8 = ((mask64 >> shift) & 0xFF) as u8;
644        let byte1: u8 = ((mask64 >> (shift + 8)) & 0xFF) as u8;
645        let byte2: u8 = ((mask64 >> (shift + 16)) & 0xFF) as u8;
646        let byte3: u8 = ((mask64 >> (shift + 24)) & 0xFF) as u8;
647
648        // Convert to mask registers - independent operations
649        let kmask0: __mmask8 = byte0;
650        let kmask1: __mmask8 = byte1;
651        let kmask2: __mmask8 = byte2;
652        let kmask3: __mmask8 = byte3;
653
654        // Predicated broadcasts - all can execute in parallel on different ports
655        let result0: __m512i = _mm512_maskz_set1_epi64(kmask0, scalar);
656        let result1: __m512i = _mm512_maskz_set1_epi64(kmask1, scalar);
657        let result2: __m512i = _mm512_maskz_set1_epi64(kmask2, scalar);
658        let result3: __m512i = _mm512_maskz_set1_epi64(kmask3, scalar);
659
660        // Stores - can pipeline as they have no dependencies on each other
661        _mm512_storeu_si512(out_ptr.add(i) as *mut __m512i, result0);
662        _mm512_storeu_si512(out_ptr.add(i + 8) as *mut __m512i, result1);
663        _mm512_storeu_si512(out_ptr.add(i + 16) as *mut __m512i, result2);
664        _mm512_storeu_si512(out_ptr.add(i + 24) as *mut __m512i, result3);
665
666        i += 32;
667    }
668
669    // Handle remaining full vectors (8 coefficients at a time)
670    while i + 8 <= N {
671        let shift = i as u32;
672        let byte: u8 = ((mask64 >> shift) & 0xFF) as u8;
673        let kmask: __mmask8 = byte;
674        let result: __m512i = _mm512_maskz_set1_epi64(kmask, scalar);
675        _mm512_storeu_si512(out_ptr.add(i) as *mut __m512i, result);
676        i += 8;
677    }
678
679    // Tail: handle remaining coefficients one at a time
680    while i < N {
681        let bit = ((mask64 >> i) & 1) != 0;
682        let mask = -(bit as i64); // 0 or -1 (all bits set)
683        *out_ptr.add(i) = scalar & mask;
684        i += 1;
685    }
686}
687
688#[cfg(test)]
689mod tests {
690    use crate::univariate::binary_ref::BinaryRefPoly;
691    use zinc_transcript::traits::{ConstTranscribable, GenTranscribable};
692    use zinc_utils::CHECKED;
693
694    use super::*;
695
696    #[test]
697    fn evaluate_is_correct() {
698        for i in 0..16 {
699            let poly = BinaryU64Poly::<4>::new([
700                (i & 0b0001 != 0).into(),
701                (i & 0b0010 != 0).into(),
702                (i & 0b0100 != 0).into(),
703                (i & 0b1000 != 0).into(),
704            ]);
705
706            let result = poly.evaluate_at_point(&2).unwrap();
707
708            assert_eq!(result, i);
709        }
710    }
711
712    #[test]
713    fn transcribable() {
714        let original =
715            BinaryU64Poly::<4>::new([true.into(), false.into(), true.into(), false.into()]);
716        let mut bytes = [0u8; BinaryU64Poly::<4>::NUM_BYTES];
717        assert_eq!(bytes.len(), u64::NUM_BYTES);
718
719        original.write_transcription_bytes_exact(&mut bytes);
720        let deserialized = BinaryU64Poly::<4>::read_transcription_bytes_exact(&bytes);
721        assert_eq!(original, deserialized);
722    }
723
724    fn widen_ref<const DEGREE_PLUS_ONE: usize>(
725        poly: &BinaryU64Poly<DEGREE_PLUS_ONE>,
726        scalar: i64,
727    ) -> DensePolynomial<i64, DEGREE_PLUS_ONE> {
728        let mut coeffs: [i64; DEGREE_PLUS_ONE] = [0; DEGREE_PLUS_ONE];
729        for (i, coeff) in coeffs.iter_mut().enumerate().take(DEGREE_PLUS_ONE) {
730            if (poly.0 & (1 << i)) != 0 {
731                *coeff = scalar;
732            }
733        }
734        DensePolynomial { coeffs }
735    }
736
737    #[ignore = "CI system does not support SIMD features"]
738    #[test]
739    fn widen_ref_and_widen_ref_simd_match() {
740        // Test with degree 4
741        for i in 0..16 {
742            let poly = BinaryU64Poly::<4>::new([
743                (i & 0b0001 != 0).into(),
744                (i & 0b0010 != 0).into(),
745                (i & 0b0100 != 0).into(),
746                (i & 0b1000 != 0).into(),
747            ]);
748
749            let poly_ref = BinaryRefPoly::<4>::new([
750                (i & 0b0001 != 0).into(),
751                (i & 0b0010 != 0).into(),
752                (i & 0b0100 != 0).into(),
753                (i & 0b1000 != 0).into(),
754            ]);
755
756            for scalar in [1, 42, -7, 100, -100, i64::MAX, i64::MIN] {
757                let result_simd_ref = widen_ref(&poly, scalar);
758                let result_simd = widen_simd(&poly, scalar);
759                let result_ref = poly_ref.mul_by_scalar::<CHECKED>(&scalar).unwrap();
760
761                assert_eq!(
762                    result_simd_ref.coeffs, result_simd.coeffs,
763                    "Mismatch for pattern {} with scalar {}",
764                    i, scalar
765                );
766                assert_eq!(
767                    result_simd_ref.coeffs, result_ref.coeffs,
768                    "Mismatch for pattern {} with scalar {} between ref and MulByScalar",
769                    i, scalar
770                );
771            }
772        }
773
774        let coeffs: Vec<_> = [
775            1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
776            0, 1, 0,
777        ]
778        .into_iter()
779        .map(|x| (x != 0).into())
780        .collect();
781
782        let poly32 = BinaryU64Poly::<32>::new(coeffs.clone());
783        let poly32_ref = BinaryRefPoly::<32>::new(coeffs);
784
785        for scalar in [1, 42, -7, 100, -100] {
786            let result_simd_ref = widen_ref(&poly32, scalar);
787            let result_simd = widen_simd(&poly32, scalar);
788            let result_ref = poly32_ref.mul_by_scalar::<CHECKED>(&scalar).unwrap();
789            assert_eq!(
790                result_simd_ref.coeffs, result_simd.coeffs,
791                "Mismatch for degree 32 with scalar {}",
792                scalar
793            );
794            assert_eq!(
795                result_simd_ref.coeffs, result_ref.coeffs,
796                "Mismatch for degree 32 with scalar {} between ref and MulByScalar",
797                scalar
798            );
799        }
800
801        // Test with all zeros
802        let poly_zeros = BinaryU64Poly::<16>::new([false.into(); 16]);
803        let result_ref_zeros = widen_ref(&poly_zeros, 42);
804        let result_simd_zeros = widen_simd(&poly_zeros, 42);
805        assert_eq!(result_ref_zeros.coeffs, result_simd_zeros.coeffs);
806
807        // Test with all ones
808        let poly_ones = BinaryU64Poly::<16>::new([true.into(); 16]);
809        let result_ref_ones = widen_ref(&poly_ones, 42);
810        let result_simd_ones = widen_simd(&poly_ones, 42);
811        assert_eq!(result_ref_ones.coeffs, result_simd_ones.coeffs);
812    }
813}