// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Utilities for processing transparent channel.
//
// Author: Skal (pascal.massimino (at) gmail.com)

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>

//------------------------------------------------------------------------------

// Copies the 'alpha' plane into the low byte of each 32-bit 'dst' pixel,
// row by row, leaving the three remaining (RGB) bytes of each pixel intact.
// Returns non-zero if any written alpha value differs from 0xff, i.e. if the
// image actually contains transparency.
static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
                              int width, int height,
                              uint8_t* dst, int dst_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
  int i, j;
  const __m128i zero = _mm_setzero_si128();
  const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u);  // to preserve RGB
  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
  __m128i all_alphas = all_0xff;

  // We must be able to access 3 extra bytes after the last written byte
  // 'dst[4 * width - 4]', because we don't know if alpha is the first or the
  // last byte of the quadruplet.
  const int limit = (width - 1) & ~7;

  for (j = 0; j < height; ++j) {
    __m128i* out = (__m128i*)dst;
    for (i = 0; i < limit; i += 8) {
      // load 8 alpha bytes
      const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[i]);
      // widen each alpha byte into its own 32-bit lane (alpha in the low byte)
      const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
      const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
      const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
      // load 8 dst pixels (32 bytes)
      const __m128i b0_lo = _mm_loadu_si128(out + 0);
      const __m128i b0_hi = _mm_loadu_si128(out + 1);
      // mask dst alpha values
      const __m128i b1_lo = _mm_and_si128(b0_lo, rgb_mask);
      const __m128i b1_hi = _mm_and_si128(b0_hi, rgb_mask);
      // combine
      const __m128i b2_lo = _mm_or_si128(b1_lo, a2_lo);
      const __m128i b2_hi = _mm_or_si128(b1_hi, a2_hi);
      // store
      _mm_storeu_si128(out + 0, b2_lo);
      _mm_storeu_si128(out + 1, b2_hi);
      // accumulate eight alpha 'and' in parallel
      all_alphas = _mm_and_si128(all_alphas, a0);
      out += 2;
    }
    // scalar tail for the last (width - limit) pixels of the row
    for (; i < width; ++i) {
      const uint32_t alpha_value = alpha[i];
      dst[4 * i] = alpha_value;
      alpha_and &= alpha_value;
    }
    alpha += alpha_stride;
    dst += dst_stride;
  }
  // Combine the eight alpha 'and' into a 8-bit mask.
  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
  return (alpha_and != 0xff);
}

// Expands the 'alpha' plane into the green channel of 'dst': each output
// 32-bit word becomes (alpha[i] << 8), with the other channels cleared.
static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
                                      int width, int height,
                                      uint32_t* dst, int dst_stride) {
  int i, j;
  const __m128i zero = _mm_setzero_si128();
  const int limit = width & ~15;
  for (j = 0; j < height; ++j) {
    for (i = 0; i < limit; i += 16) {   // process 16 alpha bytes
      const __m128i a0 = _mm_loadu_si128((const __m128i*)&alpha[i]);
      // interleaving with 'zero' in the low byte produces (alpha << 8)
      const __m128i a1 = _mm_unpacklo_epi8(zero, a0);  // note the 'zero' first!
      const __m128i b1 = _mm_unpackhi_epi8(zero, a0);
      const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
      const __m128i b2_lo = _mm_unpacklo_epi16(b1, zero);
      const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
      const __m128i b2_hi = _mm_unpackhi_epi16(b1, zero);
      _mm_storeu_si128((__m128i*)&dst[i + 0], a2_lo);
      _mm_storeu_si128((__m128i*)&dst[i + 4], a2_hi);
      _mm_storeu_si128((__m128i*)&dst[i + 8], b2_lo);
      _mm_storeu_si128((__m128i*)&dst[i + 12], b2_hi);
    }
    // scalar tail: same (alpha << 8) placement as the SIMD path
    for (; i < width; ++i) dst[i] = alpha[i] << 8;
    alpha += alpha_stride;
    dst += dst_stride;
  }
}

// Extracts the low (alpha) byte of each 32-bit 'argb' pixel into the 'alpha'
// plane. Returns non-zero iff every extracted value equals 0xff, i.e. the
// alpha channel is trivial (note: polarity is opposite to DispatchAlpha).
static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride,
                             int width, int height,
                             uint8_t* alpha, int alpha_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
  int i, j;
  const __m128i a_mask = _mm_set1_epi32(0xffu);  // to preserve alpha
  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
  __m128i all_alphas = all_0xff;

  // We must be able to access 3 extra bytes after the last written byte
  // 'src[4 * width - 4]', because we don't know if alpha is the first or the
  // last byte of the quadruplet.
  const int limit = (width - 1) & ~7;

  for (j = 0; j < height; ++j) {
    const __m128i* src = (const __m128i*)argb;
    for (i = 0; i < limit; i += 8) {
      // load 32 argb bytes
      const __m128i a0 = _mm_loadu_si128(src + 0);
      const __m128i a1 = _mm_loadu_si128(src + 1);
      const __m128i b0 = _mm_and_si128(a0, a_mask);
      const __m128i b1 = _mm_and_si128(a1, a_mask);
      // narrow the eight 32-bit alpha lanes down to eight packed bytes
      const __m128i c0 = _mm_packs_epi32(b0, b1);
      const __m128i d0 = _mm_packus_epi16(c0, c0);
      // store
      _mm_storel_epi64((__m128i*)&alpha[i], d0);
      // accumulate eight alpha 'and' in parallel
      all_alphas = _mm_and_si128(all_alphas, d0);
      src += 2;
    }
    // scalar tail for the remaining pixels of the row
    for (; i < width; ++i) {
      const uint32_t alpha_value = argb[4 * i];
      alpha[i] = alpha_value;
      alpha_and &= alpha_value;
    }
    argb += argb_stride;
    alpha += alpha_stride;
  }
  // Combine the eight alpha 'and' into a 8-bit mask.
  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
  return (alpha_and == 0xff);
}

//------------------------------------------------------------------------------
// Non-dither premultiplied modes

#define MULTIPLIER(a) ((a) * 0x8081)
#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)

// We can't use a 'const int' for the SHUFFLE value, because it has to be an
// immediate in the _mm_shufflexx_epi16() instruction. We really need a macro.
// We use: v / 255 = (v * 0x8081) >> 23, where v = alpha * {r,g,b} is a 16bit
// value.
// Premultiplies four consecutive pixels (16 bytes starting at the uint32_t
// lvalue RGBX) by their own alpha value, in place. SHUFFLE broadcasts the
// alpha lane over the three color lanes; kMask keeps the alpha lane itself
// multiplied by 0xff so it is left unchanged. Division by 255 uses the
// (v * 0x8081) >> 23 trick described above.
#define APPLY_ALPHA(RGBX, SHUFFLE) do { \
  const __m128i argb0 = _mm_loadu_si128((const __m128i*)&(RGBX)); \
  const __m128i argb1_lo = _mm_unpacklo_epi8(argb0, zero); \
  const __m128i argb1_hi = _mm_unpackhi_epi8(argb0, zero); \
  const __m128i alpha0_lo = _mm_or_si128(argb1_lo, kMask); \
  const __m128i alpha0_hi = _mm_or_si128(argb1_hi, kMask); \
  const __m128i alpha1_lo = _mm_shufflelo_epi16(alpha0_lo, SHUFFLE); \
  const __m128i alpha1_hi = _mm_shufflelo_epi16(alpha0_hi, SHUFFLE); \
  const __m128i alpha2_lo = _mm_shufflehi_epi16(alpha1_lo, SHUFFLE); \
  const __m128i alpha2_hi = _mm_shufflehi_epi16(alpha1_hi, SHUFFLE); \
  /* alpha2 = [ff a0 a0 a0][ff a1 a1 a1] */ \
  const __m128i A0_lo = _mm_mullo_epi16(alpha2_lo, argb1_lo); \
  const __m128i A0_hi = _mm_mullo_epi16(alpha2_hi, argb1_hi); \
  const __m128i A1_lo = _mm_mulhi_epu16(A0_lo, kMult); \
  const __m128i A1_hi = _mm_mulhi_epu16(A0_hi, kMult); \
  const __m128i A2_lo = _mm_srli_epi16(A1_lo, 7); \
  const __m128i A2_hi = _mm_srli_epi16(A1_hi, 7); \
  const __m128i A3 = _mm_packus_epi16(A2_lo, A2_hi); \
  _mm_storeu_si128((__m128i*)&(RGBX), A3); \
} while (0)

// Premultiplies a w x h block of RGBA (or ARGB, when 'alpha_first') pixels
// by their alpha channel, in place. The alpha bytes themselves are preserved.
static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
                                    int w, int h, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i kMult = _mm_set1_epi16(0x8081u);
  const __m128i kMask = _mm_set_epi16(0, 0xff, 0xff, 0, 0, 0xff, 0xff, 0);
  const int kSpan = 4;  // pixels handled per APPLY_ALPHA invocation
  while (h-- > 0) {
    uint32_t* const rgbx = (uint32_t*)rgba;
    int i;
    if (!alpha_first) {
      // alpha is the last byte of each quadruplet: broadcast lane 3
      for (i = 0; i + kSpan <= w; i += kSpan) {
        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(2, 3, 3, 3));
      }
    } else {
      // alpha is the first byte of each quadruplet: broadcast lane 0
      for (i = 0; i + kSpan <= w; i += kSpan) {
        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 1));
      }
    }
    // Finish with left-overs.
    for (; i < w; ++i) {
      uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
      const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
      const uint32_t a = alpha[4 * i];
      if (a != 0xff) {  // a == 0xff means multiply by 1: nothing to do
        const uint32_t mult = MULTIPLIER(a);
        rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
        rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
        rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
      }
    }
    rgba += stride;
  }
}
#undef MULTIPLIER
#undef PREMULTIPLY

//------------------------------------------------------------------------------
// Alpha detection

// Returns 1 as soon as any of the 'length' bytes in 'src' differs from 0xff,
// 0 otherwise (i.e. 0 means the 8-bit alpha plane is fully opaque).
static int HasAlpha8b_SSE2(const uint8_t* src, int length) {
  const __m128i all_0xff = _mm_set1_epi8(0xff);
  int i = 0;
  for (; i + 16 <= length; i += 16) {
    const __m128i v = _mm_loadu_si128((const __m128i*)(src + i));
    const __m128i bits = _mm_cmpeq_epi8(v, all_0xff);
    const int mask = _mm_movemask_epi8(bits);
    if (mask != 0xffff) return 1;  // at least one lane was not 0xff
  }
  for (; i < length; ++i) if (src[i] != 0xff) return 1;
  return 0;
}

// Returns 1 if any alpha value (the first byte of each of the 'length'
// quadruplets in 'src') differs from 0xff, 0 otherwise.
static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
  const __m128i alpha_mask = _mm_set1_epi32(0xff);  // keep low byte per pixel
  const __m128i all_0xff = _mm_set1_epi8(0xff);
  int i = 0;
  // We don't know if we can access the last 3 bytes after the last alpha
  // value 'src[4 * length - 4]' (because we don't know if alpha is the first
  // or the last byte of the quadruplet). Hence the '-3' protection below.
  length = length * 4 - 3;  // size in bytes
  for (; i + 64 <= length; i += 64) {  // 16 pixels at a time
    const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0));
    const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
    const __m128i a2 = _mm_loadu_si128((const __m128i*)(src + i + 32));
    const __m128i a3 = _mm_loadu_si128((const __m128i*)(src + i + 48));
    const __m128i b0 = _mm_and_si128(a0, alpha_mask);
    const __m128i b1 = _mm_and_si128(a1, alpha_mask);
    const __m128i b2 = _mm_and_si128(a2, alpha_mask);
    const __m128i b3 = _mm_and_si128(a3, alpha_mask);
    // pack the sixteen 32-bit alpha lanes into sixteen bytes
    const __m128i c0 = _mm_packs_epi32(b0, b1);
    const __m128i c1 = _mm_packs_epi32(b2, b3);
    const __m128i d = _mm_packus_epi16(c0, c1);
    const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
    const int mask = _mm_movemask_epi8(bits);
    if (mask != 0xffff) return 1;
  }
  for (; i + 32 <= length; i += 32) {  // 8 pixels at a time
    const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0));
    const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
    const __m128i b0 = _mm_and_si128(a0, alpha_mask);
    const __m128i b1 = _mm_and_si128(a1, alpha_mask);
    const __m128i c = _mm_packs_epi32(b0, b1);
    const __m128i d = _mm_packus_epi16(c, c);
    const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
    const int mask = _mm_movemask_epi8(bits);
    if (mask != 0xffff) return 1;
  }
  // scalar tail; '<=' because 'length' was reduced by 3 above
  for (; i <= length; i += 4) if (src[i] != 0xff) return 1;
  return 0;
}

// -----------------------------------------------------------------------------
// Apply alpha value to rows

// Premultiplies one row of 'width' ARGB pixels by their alpha, in place.
// Only the forward (!inverse) direction is vectorized; the inverse case and
// any leftover pixels are delegated to the C fallback.
static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 2;  // pixels per iteration
    const __m128i zero = _mm_setzero_si128();
    const __m128i k128 = _mm_set1_epi16(128);
    const __m128i kMult = _mm_set1_epi16(0x0101);
    const __m128i kMask = _mm_set_epi16(0, 0xff, 0, 0, 0, 0xff, 0, 0);
    for (x = 0; x + kSpan <= width; x += kSpan) {
      // To compute 'result = (int)(a * x / 255. + .5)', we use:
      //   tmp = a * v + 128, result = (tmp * 0x0101u) >> 16
      const __m128i A0 = _mm_loadl_epi64((const __m128i*)&ptr[x]);
      const __m128i A1 = _mm_unpacklo_epi8(A0, zero);
      const __m128i A2 = _mm_or_si128(A1, kMask);  // force alpha lane to 0xff
      const __m128i A3 = _mm_shufflelo_epi16(A2, _MM_SHUFFLE(2, 3, 3, 3));
      const __m128i A4 = _mm_shufflehi_epi16(A3, _MM_SHUFFLE(2, 3, 3, 3));
      // here, A4 = [ff a0 a0 a0][ff a1 a1 a1]
      const __m128i A5 = _mm_mullo_epi16(A4, A1);
      const __m128i A6 = _mm_add_epi16(A5, k128);
      const __m128i A7 = _mm_mulhi_epu16(A6, kMult);
      const __m128i A10 = _mm_packus_epi16(A7, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], A10);
    }
  }
  width -= x;
  if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse);
}

// Multiplies each byte ptr[x] by alpha[x] / 255 (rounded), in place.
// Only the forward (!inverse) direction is vectorized; the rest goes through
// the C fallback.
static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
                         int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i k128 = _mm_set1_epi16(128);
    const __m128i kMult = _mm_set1_epi16(0x0101);
    for (x = 0; x + 8 <= width; x += 8) {
      const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
      const __m128i v1 = _mm_unpacklo_epi8(v0, zero);
      const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
      // v * a / 255 ~= ((v * a + 128) * 0x0101) >> 16
      const __m128i v2 = _mm_mullo_epi16(v1, a1);
      const __m128i v3 = _mm_add_epi16(v2, k128);
      const __m128i v4 = _mm_mulhi_epu16(v3, kMult);
      const __m128i v5 = _mm_packus_epi16(v4, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], v5);
    }
  }
  width -= x;
  if (width > 0) WebPMultRow_C(ptr + x, alpha + x, width, inverse);
}

//------------------------------------------------------------------------------
// Entry point

extern void WebPInitAlphaProcessingSSE2(void);

// Installs the SSE2 implementations into the dsp function pointers.
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
  WebPMultARGBRow = MultARGBRow_SSE2;
  WebPMultRow = MultRow_SSE2;
  WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2;
  WebPDispatchAlpha = DispatchAlpha_SSE2;
  WebPDispatchAlphaToGreen = DispatchAlphaToGreen_SSE2;
  WebPExtractAlpha = ExtractAlpha_SSE2;

  WebPHasAlpha8b = HasAlpha8b_SSE2;
  WebPHasAlpha32b = HasAlpha32b_SSE2;
}

#else  // !WEBP_USE_SSE2

WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE2)

#endif  // WEBP_USE_SSE2