@@ -20,6 +20,10 @@ internal static class AvxIntrinsics
2020 {
2121 private static readonly Vector256 < float > _absMask256 = Avx . StaticCast < int , float > ( Avx . SetAllVector256 ( 0x7FFFFFFF ) ) ;
2222
23+ // The number of 32-bit floats held by one Vector256<float> (256 bits / 32 bits = 8); this is an element count, not a byte alignment, and is used as the per-iteration element stride of the AVX loops
24+ private const int AvxAlignment = 8 ;
25+
26+ // The size of a Vector256<T> in bytes (256 bits / 8 = 32), corresponding to _cbAlign in AlignedArray
2327 private const int Vector256Alignment = 32 ;
2428
2529 [ MethodImplAttribute ( MethodImplOptions . AggressiveInlining ) ]
@@ -415,32 +419,32 @@ public static unsafe void AddScalarU(float scalar, Span<float> dst)
415419 {
416420 fixed ( float * pdst = dst )
417421 {
418- float * pDstEnd = pdst + dst . Length ;
419- float * pDstCurrent = pdst ;
420-
421422 Vector256 < float > scalarVector256 = Avx . SetAllVector256 ( scalar ) ;
423+ int countAvx = Math . DivRem ( dst . Length , AvxAlignment , out int remainderAvx ) ;
424+ float * pDstCurrent = pdst ;
422425
423- while ( pDstCurrent + 8 <= pDstEnd )
426+ for ( int i = 0 ; i < countAvx ; i ++ )
424427 {
425428 Vector256 < float > dstVector = Avx . LoadVector256 ( pDstCurrent ) ;
426429 dstVector = Avx . Add ( dstVector , scalarVector256 ) ;
427430 Avx . Store ( pDstCurrent , dstVector ) ;
428431
429- pDstCurrent += 8 ;
432+ pDstCurrent += AvxAlignment ;
430433 }
431434
432435 Vector128 < float > scalarVector128 = Sse . SetAllVector128 ( scalar ) ;
436+ int countSse = Math . DivRem ( remainderAvx , SseIntrinsics . SseAlignment , out int remainderSse ) ;
433437
434- if ( pDstCurrent + 4 <= pDstEnd )
438+ if ( countSse > 0 )
435439 {
436440 Vector128 < float > dstVector = Sse . LoadVector128 ( pDstCurrent ) ;
437441 dstVector = Sse . Add ( dstVector , scalarVector128 ) ;
438442 Sse . Store ( pDstCurrent , dstVector ) ;
439443
440- pDstCurrent += 4 ;
444+ pDstCurrent += SseIntrinsics . SseAlignment ;
441445 }
442446
443- while ( pDstCurrent < pDstEnd )
447+ for ( int i = 0 ; i < remainderSse ; i ++ )
444448 {
445449 Vector128 < float > dstVector = Sse . LoadScalarVector128 ( pDstCurrent ) ;
446450 dstVector = Sse . AddScalar ( dstVector , scalarVector128 ) ;
0 commit comments