// zinc_poly/univariate/binary_u64.rs

use crate::{
    ConstCoeffBitWidth, EvaluatablePolynomial, EvaluationError, Polynomial,
    univariate::{dense::DensePolynomial, prepare_projection},
};
use core::mem::MaybeUninit;
use crypto_primitives::{PrimeField, Semiring, semiring::boolean::Boolean};
use derive_more::{AsRef, Display};
use num_traits::{CheckedAdd, CheckedMul, CheckedSub, One, Zero};
use rand::{distr::StandardUniform, prelude::*};
use std::{
    array,
    hash::Hash,
    iter::{Product, Sum},
    marker::PhantomData,
    ops::{Add, AddAssign, Mul, MulAssign, Sub, SubAssign},
};
use zinc_transcript::delegate_const_transcribable;
use zinc_utils::{
    from_ref::FromRef,
    inner_product::{InnerProduct, InnerProductError},
    mul_by_scalar::MulByScalar,
    named::Named,
    projectable_to_field::ProjectableToField,
};

#[derive(AsRef, Clone, Debug, Default, Display, Hash, PartialEq, Eq)]
#[repr(transparent)]
pub struct BinaryU64Poly<const DEGREE_PLUS_ONE: usize>(u64); // a u64 holds up to 64 coefficients (degree up to 63), which is ok for now

impl<const DEGREE_PLUS_ONE: usize> BinaryU64Poly<DEGREE_PLUS_ONE> {
    #[inline(always)]
    pub const fn inner(&self) -> &u64 {
        &self.0
    }
}

impl<const DEGREE_PLUS_ONE: usize> From<BinaryU64Poly<DEGREE_PLUS_ONE>> for u64 {
    #[inline(always)]
    fn from(binary_poly: BinaryU64Poly<DEGREE_PLUS_ONE>) -> Self {
        binary_poly.0
    }
}

impl<const DEGREE_PLUS_ONE: usize> From<BinaryU64Poly<DEGREE_PLUS_ONE>>
    for DensePolynomial<Boolean, DEGREE_PLUS_ONE>
{
    #[inline(always)]
    fn from(binary_poly: BinaryU64Poly<DEGREE_PLUS_ONE>) -> Self {
        DensePolynomial {
            coeffs: array::from_fn(|i| Boolean::new(!(binary_poly.0 & (1 << i)).is_zero())),
        }
    }
}
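
// Illustrative example (added here, not part of the original source): bit i of the
// packed u64 is coefficient i, so the word for coefficients [1, 0, 1, 0] is 0b0101,
// and the DensePolynomial conversion reproduces the same coefficients.
#[cfg(test)]
mod bit_layout_example {
    use super::*;

    #[test]
    fn packed_word_matches_bit_layout() {
        let poly =
            BinaryU64Poly::<4>::new([true.into(), false.into(), true.into(), false.into()]);
        assert_eq!(u64::from(poly.clone()), 0b0101);

        let dense: DensePolynomial<Boolean, 4> = poly.into();
        let bits: Vec<bool> = dense.coeffs.iter().map(|c| c.inner()).collect();
        assert_eq!(bits, vec![true, false, true, false]);
    }
}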

impl From<u32> for BinaryU64Poly<32> {
    fn from(value: u32) -> Self {
        Self(u64::from(value)) // all 32 bits become coefficients; the upper bits of the u64 stay zero
    }
}

impl From<u64> for BinaryU64Poly<64> {
    fn from(value: u64) -> Self {
        Self(value) // we don't ignore any bits
    }
}

impl<const DEGREE_PLUS_ONE: usize> BinaryU64Poly<DEGREE_PLUS_ONE> {
    /// Create a new polynomial with the given coefficients.
    /// If the input has fewer than DEGREE_PLUS_ONE coefficients, the remaining
    /// slots are filled with zeros. If the input has more than DEGREE_PLUS_ONE
    /// coefficients, this panics.
    #[inline(always)]
    pub fn new(coeffs: impl AsRef<[Boolean]>) -> Self {
        // Self(DensePolynomial::new(coeffs))
        let coeffs = coeffs.as_ref();
        assert!(coeffs.len() <= DEGREE_PLUS_ONE);
        let mut value: u64 = 0;
        for (i, coeff) in coeffs.iter().enumerate() {
            if coeff.inner() {
                value |= 1 << i;
            }
        }
        Self(value)
    }

    /// Create a new polynomial with the given coefficients.
    /// If the input has fewer than DEGREE_PLUS_ONE coefficients, the remaining
    /// slots are filled with zeros. If the input has more than DEGREE_PLUS_ONE
    /// coefficients, this panics.
    #[inline(always)]
    pub fn new_padded(coeffs: impl AsRef<[Boolean]>) -> Self {
        let coeffs = coeffs.as_ref();
        assert!(coeffs.len() <= DEGREE_PLUS_ONE);
        let mut value: u64 = 0;
        for (i, coeff) in coeffs.iter().enumerate() {
            if coeff.inner() {
                value |= 1 << i;
            }
        }
        Self(value)
    }
}

impl<const DEGREE_PLUS_ONE: usize> Zero for BinaryU64Poly<DEGREE_PLUS_ONE> {
    #[inline(always)]
    fn zero() -> Self {
        Self(0)
    }

    #[inline(always)]
    fn is_zero(&self) -> bool {
        self.0.is_zero()
    }
}

impl<const DEGREE_PLUS_ONE: usize> One for BinaryU64Poly<DEGREE_PLUS_ONE> {
    #[inline(always)]
    fn one() -> Self {
        Self(1)
    }
}

impl<'a, const DEGREE_PLUS_ONE: usize> Add<&'a Self> for BinaryU64Poly<DEGREE_PLUS_ONE> {
    type Output = Self;

    #[allow(clippy::arithmetic_side_effects, clippy::suspicious_arithmetic_impl)]
    #[inline(always)]
    fn add(self, rhs: &'a Self) -> Self::Output {
        // addition in GF(2) is XOR
        Self(self.0 ^ rhs.0)
    }
}

impl<const DEGREE_PLUS_ONE: usize> Add for BinaryU64Poly<DEGREE_PLUS_ONE> {
    type Output = Self;

    #[allow(clippy::arithmetic_side_effects)]
    #[inline(always)]
    fn add(self, rhs: Self) -> Self::Output {
        self.add(&rhs)
    }
}

impl<'a, const DEGREE_PLUS_ONE: usize> Sub<&'a Self> for BinaryU64Poly<DEGREE_PLUS_ONE> {
    type Output = Self;

    #[allow(clippy::arithmetic_side_effects, clippy::suspicious_arithmetic_impl)]
    #[inline(always)]
    fn sub(self, rhs: &'a Self) -> Self::Output {
        // subtraction in GF(2) is XOR
        Self(self.0 ^ rhs.0)
    }
}

impl<const DEGREE_PLUS_ONE: usize> Sub for BinaryU64Poly<DEGREE_PLUS_ONE> {
    type Output = Self;

    #[allow(clippy::arithmetic_side_effects)]
    #[inline(always)]
    fn sub(self, rhs: Self) -> Self::Output {
        self.sub(&rhs)
    }
}
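
// Illustrative example (added, not from the original source): over GF(2) both addition
// and subtraction are XOR on the packed word, so `a + b` and `a - b` agree with
// `a.0 ^ b.0`.
#[cfg(test)]
mod gf2_xor_example {
    use super::*;

    #[test]
    #[allow(clippy::arithmetic_side_effects)]
    fn add_and_sub_are_xor() {
        let a = BinaryU64Poly::<4>(0b0110);
        let b = BinaryU64Poly::<4>(0b0011);
        assert_eq!(u64::from(a.clone() + b.clone()), 0b0110 ^ 0b0011);
        assert_eq!(u64::from(a - b), 0b0101);
    }
}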

impl<const DEGREE_PLUS_ONE: usize> Mul for BinaryU64Poly<DEGREE_PLUS_ONE> {
    type Output = Self;

    #[allow(clippy::arithmetic_side_effects)]
    #[inline(always)]
    fn mul(self, rhs: Self) -> Self::Output {
        self.mul(&rhs)
    }
}

impl<'a, const DEGREE_PLUS_ONE: usize> Mul<&'a Self> for BinaryU64Poly<DEGREE_PLUS_ONE> {
    type Output = Self;

    #[allow(clippy::arithmetic_side_effects)]
    #[inline(always)]
    fn mul(self, _rhs: &'a Self) -> Self::Output {
        unimplemented!("Multiplication for BinaryU64Poly is not implemented yet");
    }
}

#[allow(clippy::suspicious_op_assign_impl)]
impl<'a, const DEGREE_PLUS_ONE: usize> AddAssign<&'a Self> for BinaryU64Poly<DEGREE_PLUS_ONE> {
    #[inline(always)]
    fn add_assign(&mut self, rhs: &'a Self) {
        // addition in GF(2) is XOR
        self.0 ^= rhs.0;
    }
}

impl<const DEGREE_PLUS_ONE: usize> AddAssign for BinaryU64Poly<DEGREE_PLUS_ONE> {
    #[inline(always)]
    fn add_assign(&mut self, rhs: Self) {
        self.add_assign(&rhs);
    }
}

#[allow(clippy::suspicious_op_assign_impl)]
impl<'a, const DEGREE_PLUS_ONE: usize> SubAssign<&'a Self> for BinaryU64Poly<DEGREE_PLUS_ONE> {
    #[inline(always)]
    fn sub_assign(&mut self, rhs: &'a Self) {
        // subtraction in GF(2) is XOR
        self.0 ^= rhs.0;
    }
}

impl<const DEGREE_PLUS_ONE: usize> SubAssign for BinaryU64Poly<DEGREE_PLUS_ONE> {
    #[inline(always)]
    fn sub_assign(&mut self, rhs: Self) {
        self.sub_assign(&rhs);
    }
}

impl<const DEGREE_PLUS_ONE: usize> MulAssign for BinaryU64Poly<DEGREE_PLUS_ONE> {
    #[inline(always)]
    fn mul_assign(&mut self, _rhs: Self) {
        unimplemented!("Multiplication for BinaryU64Poly is not implemented yet");
    }
}

impl<'a, const DEGREE_PLUS_ONE: usize> MulAssign<&'a Self> for BinaryU64Poly<DEGREE_PLUS_ONE> {
    #[inline(always)]
    fn mul_assign(&mut self, _rhs: &'a Self) {
        unimplemented!("Multiplication for BinaryU64Poly is not implemented yet");
        // self.0.mul_assign(&rhs.0);
    }
}

impl<const DEGREE_PLUS_ONE: usize> CheckedAdd for BinaryU64Poly<DEGREE_PLUS_ONE> {
    #[inline(always)]
    fn checked_add(&self, other: &Self) -> Option<Self> {
        // If any bit is set in both operands, addition would overflow (1+1)
        if (self.0 & other.0) != 0 {
            None
        } else {
            // addition in GF(2) is XOR
            Some(Self(self.0 ^ other.0))
        }
    }
}

impl<const DEGREE_PLUS_ONE: usize> CheckedSub for BinaryU64Poly<DEGREE_PLUS_ONE> {
    #[inline(always)]
    fn checked_sub(&self, other: &Self) -> Option<Self> {
        // If any bit is 0 in self and 1 in other, subtraction would overflow (0-1)
        if ((!self.0) & other.0) != 0 {
            // There exists a bit where self is 0 and other is 1
            None
        } else {
            // subtraction in GF(2) is XOR
            Some(Self(self.0 ^ other.0))
        }
    }
}
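
// Illustrative example (added, not from the original source): checked_add fails exactly
// when the operands share a set bit (a 1 + 1 coefficient), and checked_sub fails exactly
// when the rhs has a bit that self lacks (a 0 - 1 coefficient).
#[cfg(test)]
mod checked_ops_example {
    use super::*;

    #[test]
    fn checked_add_and_sub_detect_coefficient_overflow() {
        let a = BinaryU64Poly::<4>(0b0101);
        // Disjoint bits: both operations succeed and equal XOR.
        assert_eq!(
            a.checked_add(&BinaryU64Poly::<4>(0b0010)),
            Some(BinaryU64Poly::<4>(0b0111))
        );
        assert_eq!(
            a.checked_sub(&BinaryU64Poly::<4>(0b0100)),
            Some(BinaryU64Poly::<4>(0b0001))
        );
        // Overlapping bit: 1 + 1 does not fit in a Boolean coefficient.
        assert_eq!(a.checked_add(&BinaryU64Poly::<4>(0b0100)), None);
        // Bit set in rhs but not in self: 0 - 1 underflows.
        assert_eq!(a.checked_sub(&BinaryU64Poly::<4>(0b0010)), None);
    }
}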

impl<const DEGREE_PLUS_ONE: usize> CheckedMul for BinaryU64Poly<DEGREE_PLUS_ONE> {
    #[inline(always)]
    fn checked_mul(&self, _other: &Self) -> Option<Self> {
        unimplemented!("Checked multiplication for BinaryU64Poly is not implemented yet");
    }
}

impl<'a, const DEGREE_PLUS_ONE: usize> Sum<&'a Self> for BinaryU64Poly<DEGREE_PLUS_ONE> {
    #[allow(clippy::arithmetic_side_effects)]
    #[inline(always)]
    fn sum<I: Iterator<Item = &'a Self>>(iter: I) -> Self {
        // `iter.sum()` would resolve back to this impl and recurse; fold explicitly instead.
        iter.fold(Self::zero(), |acc, rhs| acc + rhs)
    }
}

impl<'a, const DEGREE_PLUS_ONE: usize> Product<&'a Self> for BinaryU64Poly<DEGREE_PLUS_ONE> {
    #[allow(clippy::arithmetic_side_effects)]
    #[inline(always)]
    fn product<I: Iterator<Item = &'a Self>>(iter: I) -> Self {
        // Same recursion hazard as `sum`; fold explicitly instead.
        iter.fold(Self::one(), |acc, rhs| acc * rhs)
    }
}

impl<const DEGREE_PLUS_ONE: usize> Semiring for BinaryU64Poly<DEGREE_PLUS_ONE> {}

impl<const DEGREE_PLUS_ONE: usize> Distribution<BinaryU64Poly<DEGREE_PLUS_ONE>>
    for StandardUniform
{
    #[inline(always)]
    #[allow(clippy::arithmetic_side_effects)]
    fn sample<Gen: Rng + ?Sized>(&self, rng: &mut Gen) -> BinaryU64Poly<DEGREE_PLUS_ONE> {
        let mut value: u64 = rng.next_u64();
        if DEGREE_PLUS_ONE < 64 {
            value &= (1u64 << DEGREE_PLUS_ONE) - 1;
        }
        BinaryU64Poly::<DEGREE_PLUS_ONE>(value)
    }
}
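
// Illustrative example (added, not from the original source; StdRng assumes rand's
// default `std_rng` feature): sampling masks the random u64 down to the low
// DEGREE_PLUS_ONE bits, so a sampled BinaryU64Poly<4> never has bits above position 3.
#[cfg(test)]
mod sampling_example {
    use super::*;
    use rand::{distr::Distribution, rngs::StdRng, SeedableRng};

    #[test]
    fn sampled_poly_fits_in_degree_bound() {
        let mut rng = StdRng::seed_from_u64(42);
        for _ in 0..100 {
            let poly: BinaryU64Poly<4> = StandardUniform.sample(&mut rng);
            assert!(u64::from(poly) < 16);
        }
    }
}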

//
// Zip-specific traits
//
impl<const DEGREE_PLUS_ONE: usize> Polynomial<Boolean> for BinaryU64Poly<DEGREE_PLUS_ONE> {
    const DEGREE_BOUND: usize = DensePolynomial::<Boolean, DEGREE_PLUS_ONE>::DEGREE_BOUND;
}

#[allow(clippy::arithmetic_side_effects)]
impl<R: Clone + Zero + One + CheckedAdd + CheckedMul, const DEGREE_PLUS_ONE: usize>
    EvaluatablePolynomial<Boolean, R> for BinaryU64Poly<DEGREE_PLUS_ONE>
{
    type EvaluationPoint = R;

    fn evaluate_at_point(&self, point: &R) -> Result<R, EvaluationError> {
        if DEGREE_PLUS_ONE.is_one() {
            return Ok(R::zero());
        }

        let mut result = R::zero();
        let mut pow = R::one();
        for i in 0..DEGREE_PLUS_ONE {
            if (self.0 & (1 << i)) != 0 {
                result = result.checked_add(&pow).ok_or(EvaluationError::Overflow)?;
            }
            if i + 1 < DEGREE_PLUS_ONE {
                pow = pow.checked_mul(point).ok_or(EvaluationError::Overflow)?;
            }
        }

        Ok(result)
    }
}

impl<const DEGREE_PLUS_ONE: usize> ConstCoeffBitWidth for BinaryU64Poly<DEGREE_PLUS_ONE> {
    const COEFF_BIT_WIDTH: usize = DensePolynomial::<Boolean, DEGREE_PLUS_ONE>::COEFF_BIT_WIDTH;
}

impl<const DEGREE_PLUS_ONE: usize> Named for BinaryU64Poly<DEGREE_PLUS_ONE> {
    fn type_name() -> String {
        format!("BPoly<{}>", Self::DEGREE_BOUND)
    }
}

delegate_const_transcribable!(BinaryU64Poly<const DEGREE_PLUS_ONE: usize>(u64));

impl<const DEGREE_PLUS_ONE: usize> FromRef<BinaryU64Poly<DEGREE_PLUS_ONE>>
    for BinaryU64Poly<DEGREE_PLUS_ONE>
{
    #[inline(always)]
    fn from_ref(poly: &BinaryU64Poly<DEGREE_PLUS_ONE>) -> Self {
        poly.clone()
    }
}

impl<const DEGREE_PLUS_ONE: usize> From<&BinaryU64Poly<DEGREE_PLUS_ONE>>
    for BinaryU64Poly<DEGREE_PLUS_ONE>
{
    #[inline(always)]
    fn from(value: &BinaryU64Poly<DEGREE_PLUS_ONE>) -> Self {
        Self::from_ref(value)
    }
}

pub struct BinaryU64PolyIter<'a, const DEGREE_PLUS_ONE: usize> {
    i: usize,
    poly: &'a BinaryU64Poly<DEGREE_PLUS_ONE>,
}

impl<'a, const DEGREE_PLUS_ONE: usize> BinaryU64PolyIter<'a, DEGREE_PLUS_ONE> {
    pub fn new(poly: &'a BinaryU64Poly<DEGREE_PLUS_ONE>) -> Self {
        Self { i: 0, poly }
    }
}

impl<'a, const DEGREE_PLUS_ONE: usize> Iterator for BinaryU64PolyIter<'a, DEGREE_PLUS_ONE> {
    type Item = Boolean;

    #[allow(clippy::arithmetic_side_effects)]
    fn next(&mut self) -> Option<Self::Item> {
        if self.i < DEGREE_PLUS_ONE {
            let i = self.i;

            self.i += 1;

            Some((!(self.poly.0 & (1 << i)).is_zero()).into())
        } else {
            None
        }
    }
}

impl<const DEGREE_PLUS_ONE: usize> BinaryU64Poly<DEGREE_PLUS_ONE> {
    pub fn iter(&self) -> BinaryU64PolyIter<'_, DEGREE_PLUS_ONE> {
        BinaryU64PolyIter::new(self)
    }
}
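
// Illustrative example (added, not from the original source): the iterator walks the
// packed word from bit 0 upward and always yields DEGREE_PLUS_ONE Boolean coefficients,
// padding with zeros above the highest set bit.
#[cfg(test)]
mod iter_example {
    use super::*;

    #[test]
    fn iter_yields_low_to_high_coefficients() {
        let poly = BinaryU64Poly::<4>(0b0101);
        let bits: Vec<bool> = poly.iter().map(|b| b.inner()).collect();
        assert_eq!(bits, vec![true, false, true, false]);
    }
}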

#[derive(Clone, Debug)]
pub struct BinaryU64PolyInnerProduct<R, const DEGREE_PLUS_ONE: usize>(PhantomData<R>);

impl<Rhs, Out, const DEGREE_PLUS_ONE: usize> InnerProduct<BinaryU64Poly<DEGREE_PLUS_ONE>, Rhs, Out>
    for BinaryU64PolyInnerProduct<Rhs, DEGREE_PLUS_ONE>
where
    Rhs: Clone,
    Out: FromRef<Rhs> + CheckedAdd,
{
    #[inline(always)]
    #[allow(clippy::arithmetic_side_effects)] // By design
    fn inner_product<const CHECK: bool>(
        lhs: &BinaryU64Poly<DEGREE_PLUS_ONE>,
        rhs: &[Rhs],
        zero: Out,
    ) -> Result<Out, InnerProductError> {
        if rhs.len() != DEGREE_PLUS_ONE {
            return Err(InnerProductError::LengthMismatch {
                lhs: DEGREE_PLUS_ONE,
                rhs: rhs.len(),
            });
        }

        let mut acc = zero;
        let mut bits = lhs.0;
        while bits != 0 {
            let i = bits.trailing_zeros() as usize;
            let rhs = Out::from_ref(&rhs[i]);
            if CHECK {
                acc = acc.checked_add(&rhs).ok_or(InnerProductError::Overflow)?;
            } else {
                acc = acc + rhs;
            }
            // changes the LSB 1 bit to 0
            bits &= bits - 1;
        }

        Ok(acc)
    }
}
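
// Illustrative example (added, not from the original source): the inner product loop
// above only visits set bits; `bits &= bits - 1` clears the lowest set bit each round,
// so the loop body runs once per nonzero coefficient rather than once per slot.
#[cfg(test)]
mod set_bit_iteration_example {
    #[test]
    #[allow(clippy::arithmetic_side_effects)]
    fn clearing_lowest_set_bit_visits_every_set_bit_once() {
        let mut bits: u64 = 0b1011_0100;
        let mut visited = Vec::new();
        while bits != 0 {
            visited.push(bits.trailing_zeros() as usize);
            bits &= bits - 1;
        }
        assert_eq!(visited, vec![2, 4, 5, 7]);
    }
}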

impl<F, const DEGREE_PLUS_ONE: usize> ProjectableToField<F> for BinaryU64Poly<DEGREE_PLUS_ONE>
where
    F: PrimeField + FromRef<F> + 'static,
{
    fn prepare_projection(sampled_value: &F) -> impl Fn(&Self) -> F + 'static {
        prepare_projection::<F, Self, _, DEGREE_PLUS_ONE>(sampled_value, |poly, i| {
            (poly.0 & (1 << i)) != 0
        })
    }
}

impl<const DEGREE_PLUS_ONE: usize> MulByScalar<&i64, DensePolynomial<i64, DEGREE_PLUS_ONE>>
    for BinaryU64Poly<DEGREE_PLUS_ONE>
{
    fn mul_by_scalar<const CHECK: bool>(
        &self,
        rhs: &i64,
    ) -> Option<DensePolynomial<i64, DEGREE_PLUS_ONE>> {
        Some(widen_simd::<DEGREE_PLUS_ONE>(self, *rhs))
    }
}

#[allow(unreachable_code, unused_variables)] // CI system does not support SIMD features
#[inline(always)]
pub fn widen_simd<const DEGREE_PLUS_ONE: usize>(
    poly: &BinaryU64Poly<DEGREE_PLUS_ONE>,
    scalar: i64,
) -> DensePolynomial<i64, DEGREE_PLUS_ONE> {
    let mut coeffs_uninit = MaybeUninit::<[i64; DEGREE_PLUS_ONE]>::uninit();
    let out_ptr = coeffs_uninit.as_mut_ptr() as *mut i64;

    #[cfg(target_arch = "aarch64")]
    unsafe {
        widen_fill_neon::<DEGREE_PLUS_ONE>(&poly.0, out_ptr, scalar);
    }
    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
    unsafe {
        widen_fill_avx512::<DEGREE_PLUS_ONE>(&poly.0, out_ptr, scalar);
    }
    #[cfg(not(any(
        target_arch = "aarch64",
        all(target_arch = "x86_64", target_feature = "avx512f")
    )))]
    {
        panic!("SIMD widening not supported on this architecture");
    }

    let coeffs = unsafe { coeffs_uninit.assume_init() };
    DensePolynomial { coeffs }
}

#[allow(
    clippy::arithmetic_side_effects,
    clippy::cast_possible_truncation,
    clippy::cast_lossless
)]
#[allow(unsafe_op_in_unsafe_fn)]
#[cfg(target_arch = "aarch64")]
#[inline(always)]
// Converts a u64 bitmask into an array of i64 values using ARM NEON SIMD.
// Processes 8 bits at a time with ~8-12× speedup vs scalar code.
unsafe fn widen_fill_neon<const N: usize>(mask_ref: &u64, out_ptr: *mut i64, scalar: i64) {
    use core::arch::aarch64::*;

    let mask64: u64 = *mask_ref;

    // Replicate scalar across SIMD lanes for branchless selection
    let scalar_v: int64x2_t = vdupq_n_s64(scalar);

    let mut i = 0usize;

    // For N == 64, use lookup table (16KB). For smaller N, use SIMD widening
    if N == 64 {
        // Precomputed lookup table: maps each byte value (0-255) directly to 8 i64
        // values. Each bit in the byte produces -1 (if set) or 0 (if clear).
        // Table size: 256 entries × 8 i64 × 8 bytes = 16KB
        #[repr(align(64))]
        struct LookupTable([[i64; 8]; 256]);

        static LUT: LookupTable = {
            let mut table = [[0i64; 8]; 256];
            let mut i = 0;
            while i < 256 {
                let mut bit = 0;
                while bit < 8 {
                    table[i][bit] = if (i & (1 << bit)) != 0 { -1 } else { 0 };
                    bit += 1;
                }
                i += 1;
            }
            LookupTable(table)
        };

        // Main loop: process 8 coefficients per iteration
        while i + 8 <= N {
            let shift = i as u32;
            let byte: u8 = ((mask64 >> shift) & 0xFF) as u8;

            // Direct table lookup: load 8 i64 values (4 int64x2_t vectors)
            let masks_ptr = LUT.0[byte as usize].as_ptr();
            let m0: int64x2_t = vld1q_s64(masks_ptr);
            let m1: int64x2_t = vld1q_s64(masks_ptr.add(2));
            let m2: int64x2_t = vld1q_s64(masks_ptr.add(4));
            let m3: int64x2_t = vld1q_s64(masks_ptr.add(6));

            // Branchless select: scalar & -1 = scalar, scalar & 0 = 0
            vst1q_s64(out_ptr.add(i), vandq_s64(scalar_v, m0));
            vst1q_s64(out_ptr.add(i + 2), vandq_s64(scalar_v, m1));
            vst1q_s64(out_ptr.add(i + 4), vandq_s64(scalar_v, m2));
            vst1q_s64(out_ptr.add(i + 6), vandq_s64(scalar_v, m3));

            i += 8;
        }
    } else {
        // Compile-time constant for bit masks [1,2,4,8,16,32,64,128]
        const BIT_MASKS: [u8; 8] = [1, 2, 4, 8, 16, 32, 64, 128];
        let bit_masks: uint8x8_t = vld1_u8(BIT_MASKS.as_ptr());
        let zero_u8: uint8x8_t = vdup_n_u8(0);

        // Main loop: process 8 coefficients per iteration
        while i + 8 <= N {
            let shift = i as u32;
            let byte: u8 = ((mask64 >> shift) & 0xFF) as u8;

            // Extract bits: replicate byte and AND with bit masks to isolate each bit
            let vbyte: uint8x8_t = vdup_n_u8(byte);
            let selected: uint8x8_t = vand_u8(vbyte, bit_masks);
            let nz: uint8x8_t = vcgt_u8(selected, zero_u8); // 0xFF if bit set, 0x00 if not

            // Sign-extend 0xFF → 0xFFFF...FFFF, 0x00 → 0x0000...0000 via signed widening
            let nz_signed: int8x8_t = vreinterpret_s8_u8(nz); // Reinterpret: 0xFF = -1, 0x00 = 0
            let s16: int16x8_t = vmovl_s8(nz_signed); // -1 → 0xFFFF, 0 → 0x0000
            let lo32: int32x4_t = vmovl_s16(vget_low_s16(s16));
            let hi32: int32x4_t = vmovl_s16(vget_high_s16(s16));

            let m0: int64x2_t = vmovl_s32(vget_low_s32(lo32)); // -1 → 0xFFFF...FFFF
            let m1: int64x2_t = vmovl_s32(vget_high_s32(lo32));
            let m2: int64x2_t = vmovl_s32(vget_low_s32(hi32));
            let m3: int64x2_t = vmovl_s32(vget_high_s32(hi32));

            // Branchless select: scalar & 0xFFFF... = scalar, scalar & 0x0000... = 0
            vst1q_s64(out_ptr.add(i), vandq_s64(scalar_v, m0));
            vst1q_s64(out_ptr.add(i + 2), vandq_s64(scalar_v, m1));
            vst1q_s64(out_ptr.add(i + 4), vandq_s64(scalar_v, m2));
            vst1q_s64(out_ptr.add(i + 6), vandq_s64(scalar_v, m3));

            i += 8;
        }
    }

    // Tail: handle remaining coefficients one at a time
    while i < N {
        let bit = ((mask64 >> i) & 1) != 0;
        let mask = -(bit as i64); // 0 or -1 (all bits set)
        *out_ptr.add(i) = scalar & mask;
        i += 1;
    }
}

#[allow(
    clippy::arithmetic_side_effects,
    clippy::cast_possible_truncation,
    clippy::cast_lossless
)]
#[allow(unsafe_op_in_unsafe_fn)]
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
#[inline(always)]
// Converts a u64 bitmask into an array of i64 values using AVX512 SIMD.
// Processes 32 bits at a time with 4-way unrolling for instruction-level
// parallelism. Significantly cleaner than NEON due to native mask register
// support.
unsafe fn widen_fill_avx512<const N: usize>(mask_ref: &u64, out_ptr: *mut i64, scalar: i64) {
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let mask64: u64 = *mask_ref;
    let mut i = 0usize;

    // Unrolled loop: process 32 coefficients (4 x 8) per iteration for ILP
    // Interleave independent operations to keep multiple execution units busy
    while i + 32 <= N {
        let shift = i as u32;

        // Extract all 4 bytes at once - no dependencies between extractions
        let byte0: u8 = ((mask64 >> shift) & 0xFF) as u8;
        let byte1: u8 = ((mask64 >> (shift + 8)) & 0xFF) as u8;
        let byte2: u8 = ((mask64 >> (shift + 16)) & 0xFF) as u8;
        let byte3: u8 = ((mask64 >> (shift + 24)) & 0xFF) as u8;

        // Convert to mask registers - independent operations
        let kmask0: __mmask8 = byte0;
        let kmask1: __mmask8 = byte1;
        let kmask2: __mmask8 = byte2;
        let kmask3: __mmask8 = byte3;

        // Predicated broadcasts - all can execute in parallel on different ports
        let result0: __m512i = _mm512_maskz_set1_epi64(kmask0, scalar);
        let result1: __m512i = _mm512_maskz_set1_epi64(kmask1, scalar);
        let result2: __m512i = _mm512_maskz_set1_epi64(kmask2, scalar);
        let result3: __m512i = _mm512_maskz_set1_epi64(kmask3, scalar);

        // Stores - can pipeline as they have no dependencies on each other
        _mm512_storeu_si512(out_ptr.add(i) as *mut __m512i, result0);
        _mm512_storeu_si512(out_ptr.add(i + 8) as *mut __m512i, result1);
        _mm512_storeu_si512(out_ptr.add(i + 16) as *mut __m512i, result2);
        _mm512_storeu_si512(out_ptr.add(i + 24) as *mut __m512i, result3);

        i += 32;
    }

    // Handle remaining full vectors (8 coefficients at a time)
    while i + 8 <= N {
        let shift = i as u32;
        let byte: u8 = ((mask64 >> shift) & 0xFF) as u8;
        let kmask: __mmask8 = byte;
        let result: __m512i = _mm512_maskz_set1_epi64(kmask, scalar);
        _mm512_storeu_si512(out_ptr.add(i) as *mut __m512i, result);
        i += 8;
    }

    // Tail: handle remaining coefficients one at a time
    while i < N {
        let bit = ((mask64 >> i) & 1) != 0;
        let mask = -(bit as i64); // 0 or -1 (all bits set)
        *out_ptr.add(i) = scalar & mask;
        i += 1;
    }
}
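
// Illustrative example (added, not from the original source): the scalar tail loops in
// both SIMD paths rely on the branchless select `scalar & -(bit as i64)`, where the mask
// is all ones when the bit is set and zero otherwise.
#[cfg(test)]
mod branchless_select_example {
    #[test]
    #[allow(clippy::arithmetic_side_effects)]
    fn sign_extended_bit_selects_scalar_or_zero() {
        let scalar: i64 = -7;
        for (bit, expected) in [(false, 0i64), (true, -7)] {
            let mask = -(bit as i64); // 0 when clear, -1 (all bits set) when set
            assert_eq!(scalar & mask, expected);
        }
    }
}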

#[cfg(test)]
mod tests {
    use crate::univariate::binary_ref::BinaryRefPoly;
    use zinc_transcript::traits::{ConstTranscribable, GenTranscribable};
    use zinc_utils::CHECKED;

    use super::*;

    #[test]
    fn evaluate_is_correct() {
        for i in 0..16 {
            let poly = BinaryU64Poly::<4>::new([
                (i & 0b0001 != 0).into(),
                (i & 0b0010 != 0).into(),
                (i & 0b0100 != 0).into(),
                (i & 0b1000 != 0).into(),
            ]);

            let result = poly.evaluate_at_point(&2).unwrap();

            assert_eq!(result, i);
        }
    }

    #[test]
    fn transcribable() {
        let original =
            BinaryU64Poly::<4>::new([true.into(), false.into(), true.into(), false.into()]);
        let mut bytes = [0u8; BinaryU64Poly::<4>::NUM_BYTES];
        assert_eq!(bytes.len(), u64::NUM_BYTES);

        original.write_transcription_bytes_exact(&mut bytes);
        let deserialized = BinaryU64Poly::<4>::read_transcription_bytes_exact(&bytes);
        assert_eq!(original, deserialized);
    }

    fn widen_ref<const DEGREE_PLUS_ONE: usize>(
        poly: &BinaryU64Poly<DEGREE_PLUS_ONE>,
        scalar: i64,
    ) -> DensePolynomial<i64, DEGREE_PLUS_ONE> {
        let mut coeffs: [i64; DEGREE_PLUS_ONE] = [0; DEGREE_PLUS_ONE];
        for (i, coeff) in coeffs.iter_mut().enumerate().take(DEGREE_PLUS_ONE) {
            if (poly.0 & (1 << i)) != 0 {
                *coeff = scalar;
            }
        }
        DensePolynomial { coeffs }
    }

    #[ignore = "CI system does not support SIMD features"]
    #[test]
    fn widen_ref_and_widen_ref_simd_match() {
        // Test with degree 4
        for i in 0..16 {
            let poly = BinaryU64Poly::<4>::new([
                (i & 0b0001 != 0).into(),
                (i & 0b0010 != 0).into(),
                (i & 0b0100 != 0).into(),
                (i & 0b1000 != 0).into(),
            ]);

            let poly_ref = BinaryRefPoly::<4>::new([
                (i & 0b0001 != 0).into(),
                (i & 0b0010 != 0).into(),
                (i & 0b0100 != 0).into(),
                (i & 0b1000 != 0).into(),
            ]);

            for scalar in [1, 42, -7, 100, -100, i64::MAX, i64::MIN] {
                let result_simd_ref = widen_ref(&poly, scalar);
                let result_simd = widen_simd(&poly, scalar);
                let result_ref = poly_ref.mul_by_scalar::<CHECKED>(&scalar).unwrap();

                assert_eq!(
                    result_simd_ref.coeffs, result_simd.coeffs,
                    "Mismatch for pattern {} with scalar {}",
                    i, scalar
                );
                assert_eq!(
                    result_simd_ref.coeffs, result_ref.coeffs,
                    "Mismatch for pattern {} with scalar {} between ref and MulByScalar",
                    i, scalar
                );
            }
        }

        let coeffs: Vec<_> = [
            1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
            0, 1, 0,
        ]
        .into_iter()
        .map(|x| (x != 0).into())
        .collect();

        let poly32 = BinaryU64Poly::<32>::new(coeffs.clone());
        let poly32_ref = BinaryRefPoly::<32>::new(coeffs);

        for scalar in [1, 42, -7, 100, -100] {
            let result_simd_ref = widen_ref(&poly32, scalar);
            let result_simd = widen_simd(&poly32, scalar);
            let result_ref = poly32_ref.mul_by_scalar::<CHECKED>(&scalar).unwrap();
            assert_eq!(
                result_simd_ref.coeffs, result_simd.coeffs,
                "Mismatch for degree 32 with scalar {}",
                scalar
            );
            assert_eq!(
                result_simd_ref.coeffs, result_ref.coeffs,
                "Mismatch for degree 32 with scalar {} between ref and MulByScalar",
                scalar
            );
        }

        // Test with all zeros
        let poly_zeros = BinaryU64Poly::<16>::new([false.into(); 16]);
        let result_ref_zeros = widen_ref(&poly_zeros, 42);
        let result_simd_zeros = widen_simd(&poly_zeros, 42);
        assert_eq!(result_ref_zeros.coeffs, result_simd_zeros.coeffs);

        // Test with all ones
        let poly_ones = BinaryU64Poly::<16>::new([true.into(); 16]);
        let result_ref_ones = widen_ref(&poly_ones, 42);
        let result_simd_ones = widen_simd(&poly_ones, 42);
        assert_eq!(result_ref_ones.coeffs, result_simd_ones.coeffs);
    }
}