1 // Copyright 2016 Google Inc. All Rights Reserved. 2 // 3 // Use of this source code is governed by a BSD-style license 4 // that can be found in the COPYING file in the root of the source 5 // tree. An additional intellectual property rights grant can be found 6 // in the file PATENTS. All contributing project authors may 7 // be found in the AUTHORS file in the root of the source tree. 8 // ----------------------------------------------------------------------------- 9 // 10 // SSE4 code common to several files. 11 // 12 // Author: Vincent Rabaud (vrabaud (at) google.com) 13 14 #ifndef WEBP_DSP_COMMON_SSE41_H_ 15 #define WEBP_DSP_COMMON_SSE41_H_ 16 17 #ifdef __cplusplus 18 extern "C" { 19 #endif 20 21 #if defined(WEBP_USE_SSE41) 22 #include <smmintrin.h> 23 24 //------------------------------------------------------------------------------ 25 // Channel mixing. 26 // Shuffles the input buffer as A0 0 0 A1 0 0 A2 ... 27 #define WEBP_SSE41_SHUFF(OUT, IN0, IN1) \ 28 OUT##0 = _mm_shuffle_epi8(*IN0, shuff0); \ 29 OUT##1 = _mm_shuffle_epi8(*IN0, shuff1); \ 30 OUT##2 = _mm_shuffle_epi8(*IN0, shuff2); \ 31 OUT##3 = _mm_shuffle_epi8(*IN1, shuff0); \ 32 OUT##4 = _mm_shuffle_epi8(*IN1, shuff1); \ 33 OUT##5 = _mm_shuffle_epi8(*IN1, shuff2); 34 35 // Pack the planar buffers 36 // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... 37 // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ... 38 static WEBP_INLINE void VP8PlanarTo24b_SSE41( 39 __m128i* const in0, __m128i* const in1, __m128i* const in2, 40 __m128i* const in3, __m128i* const in4, __m128i* const in5) { 41 __m128i R0, R1, R2, R3, R4, R5; 42 __m128i G0, G1, G2, G3, G4, G5; 43 __m128i B0, B1, B2, B3, B4, B5; 44 45 // Process R. 46 { 47 const __m128i shuff0 = _mm_set_epi8( 48 5, -1, -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0); 49 const __m128i shuff1 = _mm_set_epi8( 50 -1, 10, -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1); 51 const __m128i shuff2 = _mm_set_epi8( 52 -1, -1, 15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1); 53 WEBP_SSE41_SHUFF(R, in0, in1) 54 } 55 56 // Process G. 57 { 58 // Same as before, just shifted to the left by one and including the right 59 // padding. 60 const __m128i shuff0 = _mm_set_epi8( 61 -1, -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0, -1); 62 const __m128i shuff1 = _mm_set_epi8( 63 10, -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1, 5); 64 const __m128i shuff2 = _mm_set_epi8( 65 -1, 15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1, -1); 66 WEBP_SSE41_SHUFF(G, in2, in3) 67 } 68 69 // Process B. 70 { 71 const __m128i shuff0 = _mm_set_epi8( 72 -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0, -1, -1); 73 const __m128i shuff1 = _mm_set_epi8( 74 -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1, 5, -1); 75 const __m128i shuff2 = _mm_set_epi8( 76 15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1, -1, 10); 77 WEBP_SSE41_SHUFF(B, in4, in5) 78 } 79 80 // OR the different channels. 81 { 82 const __m128i RG0 = _mm_or_si128(R0, G0); 83 const __m128i RG1 = _mm_or_si128(R1, G1); 84 const __m128i RG2 = _mm_or_si128(R2, G2); 85 const __m128i RG3 = _mm_or_si128(R3, G3); 86 const __m128i RG4 = _mm_or_si128(R4, G4); 87 const __m128i RG5 = _mm_or_si128(R5, G5); 88 *in0 = _mm_or_si128(RG0, B0); 89 *in1 = _mm_or_si128(RG1, B1); 90 *in2 = _mm_or_si128(RG2, B2); 91 *in3 = _mm_or_si128(RG3, B3); 92 *in4 = _mm_or_si128(RG4, B4); 93 *in5 = _mm_or_si128(RG5, B5); 94 } 95 } 96 97 #undef WEBP_SSE41_SHUFF 98 99 // Convert four packed four-channel buffers like argbargbargbargb... into the 100 // split channels aaaaa ... rrrr ... gggg .... bbbbb ...... 101 static WEBP_INLINE void VP8L32bToPlanar_SSE41(__m128i* const in0, 102 __m128i* const in1, 103 __m128i* const in2, 104 __m128i* const in3) { 105 // aaaarrrrggggbbbb 106 const __m128i shuff0 = 107 _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); 108 const __m128i A0 = _mm_shuffle_epi8(*in0, shuff0); 109 const __m128i A1 = _mm_shuffle_epi8(*in1, shuff0); 110 const __m128i A2 = _mm_shuffle_epi8(*in2, shuff0); 111 const __m128i A3 = _mm_shuffle_epi8(*in3, shuff0); 112 // A0A1R0R1 113 // G0G1B0B1 114 // A2A3R2R3 115 // G0G1B0B1 116 const __m128i B0 = _mm_unpacklo_epi32(A0, A1); 117 const __m128i B1 = _mm_unpackhi_epi32(A0, A1); 118 const __m128i B2 = _mm_unpacklo_epi32(A2, A3); 119 const __m128i B3 = _mm_unpackhi_epi32(A2, A3); 120 *in3 = _mm_unpacklo_epi64(B0, B2); 121 *in2 = _mm_unpackhi_epi64(B0, B2); 122 *in1 = _mm_unpacklo_epi64(B1, B3); 123 *in0 = _mm_unpackhi_epi64(B1, B3); 124 } 125 126 #endif // WEBP_USE_SSE41 127 128 #ifdef __cplusplus 129 } // extern "C" 130 #endif 131 132 #endif // WEBP_DSP_COMMON_SSE41_H_ 133