From 35d2afa0bb4be7e50d26d5ae5435dbcaa6ece4c9 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 1 Nov 2021 20:18:21 +0100 Subject: [PATCH 1/3] Add sse2 version of select --- .../Formats/Webp/Lossless/LosslessUtils.cs | 60 +++++++++++++++---- .../Formats/Webp/Lossless/PredictorEncoder.cs | 27 +++++---- 2 files changed, 64 insertions(+), 23 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index b7f94415be..7e21517d20 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -27,6 +27,10 @@ internal static unsafe class LosslessUtils private const double Log2Reciprocal = 1.44269504088896338700465094007086; +#if SUPPORTS_RUNTIME_INTRINSICS + private static readonly Vector128 Zero = Vector128.Create(0).AsByte(); +#endif + /// /// Returns the exact index where array1 and array2 are different. For an index /// inferior or equal to bestLenMatch, the return value just has to be strictly @@ -551,6 +555,7 @@ public static void PredictorInverseTransform( int mask = tileWidth - 1; int tilesPerRow = SubSampleSize(width, transform.Bits); int predictorModeIdxBase = (y >> transform.Bits) * tilesPerRow; + Span scratch = stackalloc short[8]; while (y < yEnd) { int predictorModeIdx = predictorModeIdxBase; @@ -608,7 +613,7 @@ public static void PredictorInverseTransform( PredictorAdd10(input + x, output + x - width, xEnd - x, output + x); break; case 11: - PredictorAdd11(input + x, output + x - width, xEnd - x, output + x); + PredictorAdd11(input + x, output + x - width, xEnd - x, output + x, scratch); break; case 12: PredictorAdd12(input + x, output + x - width, xEnd - x, output + x); @@ -974,11 +979,11 @@ private static void PredictorAdd10(uint* input, uint* upper, int numberOfPixels, } [MethodImpl(InliningOptions.ShortMethod)] - private static void PredictorAdd11(uint* input, uint* upper, int numberOfPixels, uint* output) + private static void PredictorAdd11(uint* input, uint* upper, int numberOfPixels, uint* output, Span scratch) { for (int x = 0; x < numberOfPixels; x++) { - uint pred = Predictor11(output[x - 1], upper + x); + uint pred = Predictor11(output[x - 1], upper + x, scratch); output[x] = AddPixels(input[x], pred); } } @@ -1031,7 +1036,7 @@ private static void PredictorAdd13(uint* input, uint* upper, int numberOfPixels, public static uint Predictor10(uint left, uint* top) => Average4(left, top[-1], top[0], top[1]); [MethodImpl(InliningOptions.ShortMethod)] - public static uint Predictor11(uint left, uint* top) => Select(top[0], left, top[-1]); + public static uint Predictor11(uint left, uint* top, Span scratch) => Select(top[0], left, top[-1], scratch); [MethodImpl(InliningOptions.ShortMethod)] public static uint Predictor12(uint left, uint* top) => ClampedAddSubtractFull(left, top[0], top[-1]); @@ -1148,11 +1153,11 @@ public static void PredictorSub10(uint* input, uint* upper, int numPixels, uint* } [MethodImpl(InliningOptions.ShortMethod)] - public static void PredictorSub11(uint* input, uint* upper, int numPixels, uint* output) + public static void PredictorSub11(uint* input, uint* upper, int numPixels, uint* output, Span scratch) { for (int x = 0; x < numPixels; x++) { - uint pred = Predictor11(input[x - 1], upper + x); + uint pred = Predictor11(input[x - 1], upper + x, scratch); output[x] = SubPixels(input[x], pred); } } @@ -1240,14 +1245,43 @@ private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2) private static Vector128 MkCst16(int hi, int lo) => Vector128.Create((hi << 16) | (lo & 0xffff)); #endif - private static uint Select(uint a, uint b, uint c) + private static uint Select(uint a, uint b, uint c, Span scratch) { - int paMinusPb = - Sub3((int)(a >> 24), (int)(b >> 24), (int)(c >> 24)) + - Sub3((int)((a >> 16) & 0xff), (int)((b >> 16) & 0xff), (int)((c >> 16) & 0xff)) + - Sub3((int)((a >> 8) & 0xff), (int)((b >> 8) & 0xff), (int)((c >> 8) & 0xff)) + - Sub3((int)(a & 0xff), (int)(b & 0xff), (int)(c & 0xff)); - return paMinusPb <= 0 ? a : b; +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) + { + Span output = scratch; + fixed (short* p = output) + { + Vector128 a0 = Sse2.ConvertScalarToVector128UInt32(a).AsByte(); + Vector128 b0 = Sse2.ConvertScalarToVector128UInt32(b).AsByte(); + Vector128 c0 = Sse2.ConvertScalarToVector128UInt32(c).AsByte(); + Vector128 ac0 = Sse2.SubtractSaturate(a0, c0); + Vector128 ca0 = Sse2.SubtractSaturate(c0, a0); + Vector128 bc0 = Sse2.SubtractSaturate(b0, c0); + Vector128 cb0 = Sse2.SubtractSaturate(c0, b0); + Vector128 ac = Sse2.Or(ac0, ca0); + Vector128 bc = Sse2.Or(bc0, cb0); + Vector128 pa = Sse2.UnpackLow(ac, Zero); // |a - c| + Vector128 pb = Sse2.UnpackLow(bc, Zero); // |b - c| + Vector128 diff = Sse2.Subtract(pb.AsUInt16(), pa.AsUInt16()); + Sse2.Store((ushort*)p, diff); + } + + int paMinusPb = output[0] + output[1] + output[2] + output[3]; + + return (paMinusPb <= 0) ? a : b; + } + else +#endif + { + int paMinusPb = + Sub3((int)(a >> 24), (int)(b >> 24), (int)(c >> 24)) + + Sub3((int)((a >> 16) & 0xff), (int)((b >> 16) & 0xff), (int)((c >> 16) & 0xff)) + + Sub3((int)((a >> 8) & 0xff), (int)((b >> 8) & 0xff), (int)((c >> 8) & 0xff)) + + Sub3((int)(a & 0xff), (int)(b & 0xff), (int)(c & 0xff)); + return paMinusPb <= 0 ? a : b; + } } [MethodImpl(InliningOptions.ShortMethod)] diff --git a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs index 671e9a043e..2c70faa0d8 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs @@ -50,6 +50,7 @@ public static void ResidualImage( int tilesPerRow = LosslessUtils.SubSampleSize(width, bits); int tilesPerCol = LosslessUtils.SubSampleSize(height, bits); int maxQuantization = 1 << LosslessUtils.NearLosslessBits(nearLosslessQuality); + Span scratch = stackalloc short[8]; // TODO: Can we optimize this? int[][] histo = new int[4][]; @@ -84,7 +85,8 @@ public static void ResidualImage( transparentColorMode, usedSubtractGreen, nearLossless, - image); + image, + scratch); image[(tileY * tilesPerRow) + tileX] = (uint)(WebpConstants.ArgbBlack | (pred << 8)); } @@ -192,7 +194,8 @@ private static int GetBestPredictorForTile( WebpTransparentColorMode transparentColorMode, bool usedSubtractGreen, bool nearLossless, - Span modes) + Span modes, + Span scratch) { const int numPredModes = 14; int startX = tileX << bits; @@ -272,7 +275,7 @@ private static int GetBestPredictorForTile( } } - GetResidual(width, height, upperRow, currentRow, maxDiffs, mode, startX, startX + maxX, y, maxQuantization, transparentColorMode, usedSubtractGreen, nearLossless, residuals); + GetResidual(width, height, upperRow, currentRow, maxDiffs, mode, startX, startX + maxX, y, maxQuantization, transparentColorMode, usedSubtractGreen, nearLossless, residuals, scratch); for (int relativeX = 0; relativeX < maxX; ++relativeX) { UpdateHisto(histoArgb, residuals[relativeX]); @@ -333,11 +336,12 @@ private static void GetResidual( WebpTransparentColorMode transparentColorMode, bool usedSubtractGreen, bool nearLossless, - Span output) + Span output, + Span scratch) { if (transparentColorMode == WebpTransparentColorMode.Preserve) { - PredictBatch(mode, xStart, y, xEnd - xStart, currentRowSpan, upperRowSpan, output); + PredictBatch(mode, xStart, y, xEnd - xStart, currentRowSpan, upperRowSpan, output, scratch); } else { @@ -395,7 +399,7 @@ private static void GetResidual( predict = LosslessUtils.Predictor10(currentRow[x - 1], upperRow + x); break; case 11: - predict = LosslessUtils.Predictor11(currentRow[x - 1], upperRow + x); + predict = LosslessUtils.Predictor11(currentRow[x - 1], upperRow + x, scratch); break; case 12: predict = LosslessUtils.Predictor12(currentRow[x - 1], upperRow + x); @@ -583,6 +587,7 @@ private static void CopyImageWithPrediction( Span currentMaxDiffs = MemoryMarshal.Cast(currentRow.Slice(width + 1)); Span lowerMaxDiffs = currentMaxDiffs.Slice(width); + Span scratch = stackalloc short[8]; for (int y = 0; y < height; y++) { Span tmp32 = upperRow; @@ -593,7 +598,7 @@ private static void CopyImageWithPrediction( if (lowEffort) { - PredictBatch(PredLowEffort, 0, y, width, currentRow, upperRow, argb.Slice(y * width)); + PredictBatch(PredLowEffort, 0, y, width, currentRow, upperRow, argb.Slice(y * width), scratch); } else { @@ -634,7 +639,8 @@ private static void CopyImageWithPrediction( transparentColorMode, usedSubtractGreen, nearLossless, - argb.Slice((y * width) + x)); + argb.Slice((y * width) + x), + scratch); x = xEnd; } @@ -649,7 +655,8 @@ private static void PredictBatch( int numPixels, Span currentSpan, Span upperSpan, - Span outputSpan) + Span outputSpan, + Span scratch) { #pragma warning disable SA1503 // Braces should not be omitted fixed (uint* current = currentSpan) @@ -718,7 +725,7 @@ private static void PredictBatch( LosslessUtils.PredictorSub10(current + xStart, upper + xStart, numPixels, output); break; case 11: - LosslessUtils.PredictorSub11(current + xStart, upper + xStart, numPixels, output); + LosslessUtils.PredictorSub11(current + xStart, upper + xStart, numPixels, output, scratch); break; case 12: LosslessUtils.PredictorSub12(current + xStart, upper + xStart, numPixels, output); From de6bd9de7953d693b6e1a04007b2796507f65e0f Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 1 Nov 2021 21:29:10 +0100 Subject: [PATCH 2/3] Use Vector128.Zero --- src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 7e21517d20..22c2333607 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -27,10 +27,6 @@ internal static unsafe class LosslessUtils private const double Log2Reciprocal = 1.44269504088896338700465094007086; -#if SUPPORTS_RUNTIME_INTRINSICS - private static readonly Vector128 Zero = Vector128.Create(0).AsByte(); -#endif - /// /// Returns the exact index where array1 and array2 are different. For an index /// inferior or equal to bestLenMatch, the return value just has to be strictly @@ -1262,8 +1258,8 @@ private static uint Select(uint a, uint b, uint c, Span scratch) Vector128 cb0 = Sse2.SubtractSaturate(c0, b0); Vector128 ac = Sse2.Or(ac0, ca0); Vector128 bc = Sse2.Or(bc0, cb0); - Vector128 pa = Sse2.UnpackLow(ac, Zero); // |a - c| - Vector128 pb = Sse2.UnpackLow(bc, Zero); // |b - c| + Vector128 pa = Sse2.UnpackLow(ac, Vector128.Zero); // |a - c| + Vector128 pb = Sse2.UnpackLow(bc, Vector128.Zero); // |b - c| Vector128 diff = Sse2.Subtract(pb.AsUInt16(), pa.AsUInt16()); Sse2.Store((ushort*)p, diff); } From 143de220b75abd8bf44f7943650a36cbaa3f7421 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Tue, 2 Nov 2021 10:55:49 +0100 Subject: [PATCH 3/3] Add Predictor11 test --- .../Formats/WebP/LosslessUtilsTests.cs | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs index be7bc27d3a..bf381ebdaa 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs @@ -132,6 +132,30 @@ private static void RunTransformColorInverseTest() Assert.Equal(expectedOutput, pixelData); } + private static void RunPredictor11Test() + { + // arrange + uint[] topData = { 4278258949, 4278258949 }; + uint left = 4294839812; + short[] scratch = new short[8]; + uint expectedResult = 4294839812; + + // act + unsafe + { + fixed (uint* top = &topData[1]) + { + uint actual = LosslessUtils.Predictor11(left, top, scratch); + + // assert + Assert.Equal(expectedResult, actual); + } + } + } + + [Fact] + public void Predictor11_Works() => RunPredictor11Test(); + [Fact] public void SubtractGreen_Works() => RunSubtractGreenTest(); @@ -145,6 +169,12 @@ private static void RunTransformColorInverseTest() public void TransformColorInverse_Works() => RunTransformColorInverseTest(); #if SUPPORTS_RUNTIME_INTRINSICS + [Fact] + public void Predictor11_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor11Test, HwIntrinsics.AllowAll); + + [Fact] + public void Predictor11_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor11Test, HwIntrinsics.DisableSSE2); + [Fact] public void SubtractGreen_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunSubtractGreenTest, HwIntrinsics.AllowAll);