1 /* 2 ** 3 ** Copyright 2009, The Android Open Source Project 4 ** 5 ** Licensed under the Apache License, Version 2.0 (the "License"); 6 ** you may not use this file except in compliance with the License. 7 ** You may obtain a copy of the License at 8 ** 9 ** http://www.apache.org/licenses/LICENSE-2.0 10 ** 11 ** Unless required by applicable law or agreed to in writing, software 12 ** distributed under the License is distributed on an "AS IS" BASIS, 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 ** See the License for the specific language governing permissions and 15 ** limitations under the License. 16 */ 17 18 #include <emmintrin.h> 19 #include "SkBitmapProcState_opts_SSE2.h" 20 #include "SkUtils.h" 21 22 void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s, 23 const uint32_t* xy, 24 int count, uint32_t* colors) { 25 SkASSERT(count > 0 && colors != NULL); 26 SkASSERT(s.fDoFilter); 27 SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config); 28 SkASSERT(s.fAlphaScale == 256); 29 30 const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels()); 31 unsigned rb = s.fBitmap->rowBytes(); 32 uint32_t XY = *xy++; 33 unsigned y0 = XY >> 14; 34 const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb); 35 const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb); 36 unsigned subY = y0 & 0xF; 37 38 // ( 0, 0, 0, 0, 0, 0, 0, 16) 39 __m128i sixteen = _mm_cvtsi32_si128(16); 40 41 // ( 0, 0, 0, 0, 16, 16, 16, 16) 42 sixteen = _mm_shufflelo_epi16(sixteen, 0); 43 44 // ( 0, 0, 0, 0, 0, 0, 0, y) 45 __m128i allY = _mm_cvtsi32_si128(subY); 46 47 // ( 0, 0, 0, 0, y, y, y, y) 48 allY = _mm_shufflelo_epi16(allY, 0); 49 50 // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y) 51 __m128i negY = _mm_sub_epi16(sixteen, allY); 52 53 // (16-y, 16-y, 16-y, 16-y, y, y, y, y) 54 allY = _mm_unpacklo_epi64(allY, negY); 55 56 // (16, 16, 16, 16, 16, 16, 16, 16 ) 57 sixteen = _mm_shuffle_epi32(sixteen, 0); 58 59 // ( 0, 0, 0, 0, 0, 0, 0, 0) 60 __m128i zero = _mm_setzero_si128(); 61 do { 62 uint32_t XX = *xy++; // x0:14 | 4 | x1:14 63 unsigned x0 = XX >> 18; 64 unsigned x1 = XX & 0x3FFF; 65 66 // (0, 0, 0, 0, 0, 0, 0, x) 67 __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F); 68 69 // (0, 0, 0, 0, x, x, x, x) 70 allX = _mm_shufflelo_epi16(allX, 0); 71 72 // (x, x, x, x, x, x, x, x) 73 allX = _mm_shuffle_epi32(allX, 0); 74 75 // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x) 76 __m128i negX = _mm_sub_epi16(sixteen, allX); 77 78 // Load 4 samples (pixels). 79 __m128i a00 = _mm_cvtsi32_si128(row0[x0]); 80 __m128i a01 = _mm_cvtsi32_si128(row0[x1]); 81 __m128i a10 = _mm_cvtsi32_si128(row1[x0]); 82 __m128i a11 = _mm_cvtsi32_si128(row1[x1]); 83 84 // (0, 0, a00, a10) 85 __m128i a00a10 = _mm_unpacklo_epi32(a10, a00); 86 87 // Expand to 16 bits per component. 88 a00a10 = _mm_unpacklo_epi8(a00a10, zero); 89 90 // ((a00 * (16-y)), (a10 * y)). 91 a00a10 = _mm_mullo_epi16(a00a10, allY); 92 93 // (a00 * (16-y) * (16-x), a10 * y * (16-x)). 94 a00a10 = _mm_mullo_epi16(a00a10, negX); 95 96 // (0, 0, a01, a10) 97 __m128i a01a11 = _mm_unpacklo_epi32(a11, a01); 98 99 // Expand to 16 bits per component. 100 a01a11 = _mm_unpacklo_epi8(a01a11, zero); 101 102 // (a01 * (16-y)), (a11 * y) 103 a01a11 = _mm_mullo_epi16(a01a11, allY); 104 105 // (a01 * (16-y) * x), (a11 * y * x) 106 a01a11 = _mm_mullo_epi16(a01a11, allX); 107 108 // (a00*w00 + a01*w01, a10*w10 + a11*w11) 109 __m128i sum = _mm_add_epi16(a00a10, a01a11); 110 111 // (DC, a00*w00 + a01*w01) 112 __m128i shifted = _mm_shuffle_epi32(sum, 0xEE); 113 114 // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11) 115 sum = _mm_add_epi16(sum, shifted); 116 117 // Divide each 16 bit component by 256. 118 sum = _mm_srli_epi16(sum, 8); 119 120 // Pack lower 4 16 bit values of sum into lower 4 bytes. 121 sum = _mm_packus_epi16(sum, zero); 122 123 // Extract low int and store. 124 *colors++ = _mm_cvtsi128_si32(sum); 125 } while (--count > 0); 126 } 127 128 void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s, 129 const uint32_t* xy, 130 int count, uint32_t* colors) { 131 SkASSERT(count > 0 && colors != NULL); 132 SkASSERT(s.fDoFilter); 133 SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config); 134 SkASSERT(s.fAlphaScale < 256); 135 136 const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels()); 137 unsigned rb = s.fBitmap->rowBytes(); 138 uint32_t XY = *xy++; 139 unsigned y0 = XY >> 14; 140 const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb); 141 const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb); 142 unsigned subY = y0 & 0xF; 143 144 // ( 0, 0, 0, 0, 0, 0, 0, 16) 145 __m128i sixteen = _mm_cvtsi32_si128(16); 146 147 // ( 0, 0, 0, 0, 16, 16, 16, 16) 148 sixteen = _mm_shufflelo_epi16(sixteen, 0); 149 150 // ( 0, 0, 0, 0, 0, 0, 0, y) 151 __m128i allY = _mm_cvtsi32_si128(subY); 152 153 // ( 0, 0, 0, 0, y, y, y, y) 154 allY = _mm_shufflelo_epi16(allY, 0); 155 156 // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y) 157 __m128i negY = _mm_sub_epi16(sixteen, allY); 158 159 // (16-y, 16-y, 16-y, 16-y, y, y, y, y) 160 allY = _mm_unpacklo_epi64(allY, negY); 161 162 // (16, 16, 16, 16, 16, 16, 16, 16 ) 163 sixteen = _mm_shuffle_epi32(sixteen, 0); 164 165 // ( 0, 0, 0, 0, 0, 0, 0, 0) 166 __m128i zero = _mm_setzero_si128(); 167 168 // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha ) 169 __m128i alpha = _mm_set1_epi16(s.fAlphaScale); 170 171 do { 172 uint32_t XX = *xy++; // x0:14 | 4 | x1:14 173 unsigned x0 = XX >> 18; 174 unsigned x1 = XX & 0x3FFF; 175 176 // (0, 0, 0, 0, 0, 0, 0, x) 177 __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F); 178 179 // (0, 0, 0, 0, x, x, x, x) 180 allX = _mm_shufflelo_epi16(allX, 0); 181 182 // (x, x, x, x, x, x, x, x) 183 allX = _mm_shuffle_epi32(allX, 0); 184 185 // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x) 186 __m128i negX = _mm_sub_epi16(sixteen, allX); 187 188 // Load 4 samples (pixels). 189 __m128i a00 = _mm_cvtsi32_si128(row0[x0]); 190 __m128i a01 = _mm_cvtsi32_si128(row0[x1]); 191 __m128i a10 = _mm_cvtsi32_si128(row1[x0]); 192 __m128i a11 = _mm_cvtsi32_si128(row1[x1]); 193 194 // (0, 0, a00, a10) 195 __m128i a00a10 = _mm_unpacklo_epi32(a10, a00); 196 197 // Expand to 16 bits per component. 198 a00a10 = _mm_unpacklo_epi8(a00a10, zero); 199 200 // ((a00 * (16-y)), (a10 * y)). 201 a00a10 = _mm_mullo_epi16(a00a10, allY); 202 203 // (a00 * (16-y) * (16-x), a10 * y * (16-x)). 204 a00a10 = _mm_mullo_epi16(a00a10, negX); 205 206 // (0, 0, a01, a10) 207 __m128i a01a11 = _mm_unpacklo_epi32(a11, a01); 208 209 // Expand to 16 bits per component. 210 a01a11 = _mm_unpacklo_epi8(a01a11, zero); 211 212 // (a01 * (16-y)), (a11 * y) 213 a01a11 = _mm_mullo_epi16(a01a11, allY); 214 215 // (a01 * (16-y) * x), (a11 * y * x) 216 a01a11 = _mm_mullo_epi16(a01a11, allX); 217 218 // (a00*w00 + a01*w01, a10*w10 + a11*w11) 219 __m128i sum = _mm_add_epi16(a00a10, a01a11); 220 221 // (DC, a00*w00 + a01*w01) 222 __m128i shifted = _mm_shuffle_epi32(sum, 0xEE); 223 224 // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11) 225 sum = _mm_add_epi16(sum, shifted); 226 227 // Divide each 16 bit component by 256. 228 sum = _mm_srli_epi16(sum, 8); 229 230 // Multiply by alpha. 231 sum = _mm_mullo_epi16(sum, alpha); 232 233 // Divide each 16 bit component by 256. 234 sum = _mm_srli_epi16(sum, 8); 235 236 // Pack lower 4 16 bit values of sum into lower 4 bytes. 237 sum = _mm_packus_epi16(sum, zero); 238 239 // Extract low int and store. 240 *colors++ = _mm_cvtsi128_si32(sum); 241 } while (--count > 0); 242 } 243