; Copyright (c) 2011 The Chromium Authors. All rights reserved.
; Use of this source code is governed by a BSD-style license that can be
; found in the LICENSE file.

%include "media/base/simd/media_export.asm"
%include "third_party/x86inc/x86inc.asm"

;
; This file uses SSE, SSE2, SSE3, and SSSE3, which are supported by all ATOM
; processors.
;
; The CPU directive below lists exactly those four feature sets. (SSE2 must be
; present: movdqa, pshufd, pslldq and friends used by the macros in this file
; are SSE2 instructions.)
;
  SECTION_TEXT
  CPU       SSE, SSE2, SSE3, SSSE3

;
; XMM registers representing constants. We must not use these registers as
; destination operands.
; for (int i = 0; i < 16; i += 4) {
;   xmm7.b[i] = 25;  xmm7.b[i+1] = 2;   xmm7.b[i+2] = 66;  xmm7.b[i+3] = 0;
;   xmm6.b[i] = 0;   xmm6.b[i+1] = 127; xmm6.b[i+2] = 0;   xmm6.b[i+3] = 0;
;   xmm5.b[i] = 112; xmm5.b[i+1] = -74; xmm5.b[i+2] = -38; xmm5.b[i+3] = 0;
;   xmm4.b[i] = -18; xmm4.b[i+1] = -94; xmm4.b[i+2] = 112; xmm4.b[i+3] = 0;
; }
; XMM_CONST_128 (xmm3) is added word-wise as the rounding/offset term by CALC_Y
; and CALC_UV, and CALC_Y shifts a copy of it right by 3 to obtain its "+ 16"
; term, so each word lane presumably holds 128 -- confirm against the code
; that loads these registers (not part of this section of the file).
;
%define XMM_CONST_Y0    xmm7
%define XMM_CONST_Y1    xmm6
%define XMM_CONST_U     xmm5
%define XMM_CONST_V     xmm4
%define XMM_CONST_128   xmm3

;
; LOAD_XMM %1 (xmm), %2 (imm32)
; Loads an immediate value to an XMM register.
;   %1.d[0] = %1.d[1] = %1.d[2] = %1.d[3] = %2;
; Clobbers: TEMP (the immediate is staged through TEMPd).
;
%macro LOAD_XMM 2
  mov       TEMPd, %2
  movd      %1, TEMPd
  ; Broadcast dword 0 to all four dword lanes.
  pshufd    %1, %1, 00000000B
%endmacro

;
; UNPACKRGB %1 (xmm), %2 (imm8)
; Unpacks one RGB pixel in the specified XMM register by inserting a zero byte
; at byte position %2 and moving every byte at or above that position up by
; one. (The old byte 15 is shifted out.)
;   for (int i = 15; i > %2; --i) %1.b[i] = %1.b[i - 1];
;   %1.b[%2] = 0;
;   for (int i = %2 - 1; i >= 0; --i) %1.b[i] = %1.b[i];
; Clobbers: xmm1.
;
%macro UNPACKRGB 2
  ; xmm1 = bytes [%2, 15] of %1 in place, with bytes [0, %2) cleared.
  movdqa    xmm1, %1
  psrldq    xmm1, %2
  pslldq    xmm1, %2
  ; %1 keeps only its bytes [0, %2).
  pxor      %1, xmm1
  ; Move the upper part up by one byte (leaving byte %2 zero) and merge.
  pslldq    xmm1, 1
  por       %1, xmm1
%endmacro

;
; READ_ARGB %1 (xmm), %2 (imm)
; Read the specified number of ARGB (or RGB) pixels from the source and store
; them to the destination xmm register. If the input format is RGB, we read RGB
; pixels and convert them to ARGB pixels. (For this case, the alpha values of
; the output pixels become 0.)
;
; The source address is ARGBq + WIDTHq * PIXELSIZE * 2.
; Clobbers: nothing besides %1 when PIXELSIZE == 4; TEMP and xmm1 when
; PIXELSIZE == 3 (WIDTH is modified and restored in the one-pixel case).
;
%macro READ_ARGB 2

%if PIXELSIZE == 4

  ; Read ARGB pixels from the source. (This macro assumes the input buffer may
  ; not be aligned to a 16-byte boundary.)
%if %2 == 1
  movd      %1, DWORD [ARGBq + WIDTHq * 4 * 2]
%elif %2 == 2
  movq      %1, QWORD [ARGBq + WIDTHq * 4 * 2]
%elif %2 == 4
  movdqu    %1, DQWORD [ARGBq + WIDTHq * 4 * 2]
%else
%error unsupported number of pixels.
%endif

%elif PIXELSIZE == 3

  ; Read RGB pixels from the source and convert them to ARGB pixels.
%if %2 == 1
  ; Read one RGB pixel and convert it to one ARGB pixel.
  ; Save the WIDTH register to xmm1. (This macro needs to break it.)
  MOVq      xmm1, WIDTHq

  ; Assemble the three source bytes in TEMPd (byte 2 in bits 16-23, bytes 0-1
  ; in the low word; the top byte, i.e. the alpha value, stays 0) and copy it
  ; to the destination xmm register.
  lea       WIDTHq, [WIDTHq + WIDTHq * 2]
  movzx     TEMPd, BYTE [ARGBq + WIDTHq * 2 + 2]
  shl       TEMPd, 16
  mov       TEMPw, WORD [ARGBq + WIDTHq * 2]
  movd      %1, TEMPd

  ; Restore the WIDTH register.
  MOVq      WIDTHq, xmm1
%elif %2 == 2
  ; Read two RGB pixels and convert them to two ARGB pixels.
  ; Read six bytes from the source to the destination xmm register: bytes 0-3
  ; with movd, then bytes 4-5 into word lane 2 (i.e. register bytes 4-5) so
  ; that the six bytes are contiguous before UNPACKRGB inserts the alpha byte.
  ; (Word lane 3 would place them in bytes 6-7 and corrupt the second pixel.)
  mov       TEMPq, WIDTHq
  lea       TEMPq, [TEMPq + TEMPq * 2]
  movd      %1, DWORD [ARGBq + TEMPq * 2]
  pinsrw    %1, WORD [ARGBq + TEMPq * 2 + 4], 2

  ; Fill the alpha values of these RGB pixels with 0 and convert them to two
  ; ARGB pixels.
  UNPACKRGB %1, 3
%elif %2 == 4
  ; Read four RGB pixels and convert them to four ARGB pixels.
  ; Read twelve bytes from the source to the destination xmm register: eight
  ; bytes into the low qword, four bytes via xmm1 into dword lane 2 (shufps
  ; keeps dwords 0-1 of %1 and appends dwords 0-1 of xmm1).
  mov       TEMPq, WIDTHq
  lea       TEMPq, [TEMPq + TEMPq * 2]
  movq      %1, QWORD [ARGBq + TEMPq * 2]
  movd      xmm1, DWORD [ARGBq + TEMPq * 2 + 8]
  shufps    %1, xmm1, 01000100B

  ; Fill the alpha values of these RGB pixels with 0 and convert them to four
  ; ARGB pixels.
  UNPACKRGB %1, 3
  UNPACKRGB %1, 4 + 3
  UNPACKRGB %1, 4 + 4 + 3
%else
%error unsupported number of pixels.
%endif

%else
%error unsupported PIXELSIZE value.
%endif

%endmacro

;
; CALC_Y %1 (xmm), %2 (xmm)
; Calculates four Y values from four ARGB pixels stored in %2.
;   %1.b[0] = ToByte((25 * B(0) + 129 * G(0) + 66 * R(0) + 128) / 256 + 16);
;   %1.b[1] = ToByte((25 * B(1) + 129 * G(1) + 66 * R(1) + 128) / 256 + 16);
;   %1.b[2] = ToByte((25 * B(2) + 129 * G(2) + 66 * R(2) + 128) / 256 + 16);
;   %1.b[3] = ToByte((25 * B(3) + 129 * G(3) + 66 * R(3) + 128) / 256 + 16);
; Clobbers: xmm2.
;
%macro CALC_Y 2
  ; To avoid signed saturation, we divide this conversion formula into two
  ; formulae and store their results into two XMM registers %1 and xmm2.
  ; %1.w[0]   = 25 * %2.b[0]  + 2 * %2.b[1]  + 66 * %2.b[2]  + 0 * %2.b[3];
  ; %1.w[1]   = 25 * %2.b[4]  + 2 * %2.b[5]  + 66 * %2.b[6]  + 0 * %2.b[7];
  ; %1.w[2]   = 25 * %2.b[8]  + 2 * %2.b[9]  + 66 * %2.b[10] + 0 * %2.b[11];
  ; %1.w[3]   = 25 * %2.b[12] + 2 * %2.b[13] + 66 * %2.b[14] + 0 * %2.b[15];
  ; xmm2.w[0] = 0 * %2.b[0]  + 127 * %2.b[1]  + 0 * %2.b[2]  + 0 * %2.b[3];
  ; xmm2.w[1] = 0 * %2.b[4]  + 127 * %2.b[5]  + 0 * %2.b[6]  + 0 * %2.b[7];
  ; xmm2.w[2] = 0 * %2.b[8]  + 127 * %2.b[9]  + 0 * %2.b[10] + 0 * %2.b[11];
  ; xmm2.w[3] = 0 * %2.b[12] + 127 * %2.b[13] + 0 * %2.b[14] + 0 * %2.b[15];
  movdqa    %1, %2
  pmaddubsw %1, XMM_CONST_Y0
  phaddsw   %1, %1
  movdqa    xmm2, %2
  pmaddubsw xmm2, XMM_CONST_Y1
  phaddsw   xmm2, xmm2

  ; %1.b[0] = ToByte((%1.w[0] + xmm2.w[0] + 128) / 256 + 16);
  ; %1.b[1] = ToByte((%1.w[1] + xmm2.w[1] + 128) / 256 + 16);
  ; %1.b[2] = ToByte((%1.w[2] + xmm2.w[2] + 128) / 256 + 16);
  ; %1.b[3] = ToByte((%1.w[3] + xmm2.w[3] + 128) / 256 + 16);
  ; The "+ 16" term is derived from the 128 constant: 128 >> 3 == 16.
  paddw     %1, xmm2
  movdqa    xmm2, XMM_CONST_128
  paddw     %1, xmm2
  psrlw     %1, 8
  psrlw     xmm2, 3
  paddw     %1, xmm2
  packuswb  %1, %1
%endmacro

;
; INIT_UV %1 (r32), %2 (reg) %3 (imm)
; Only when SUBSAMPLING == 1 && LINE == 1: loads into %1 the bytes already
; stored at [%2 + WIDTHq], zero-extended -- one byte when %3 (the number of
; pixels) is 1 or 2, two bytes when it is 4. CALC_UV takes such a register as
; its %4 operand and averages it into the new values. For every other
; SUBSAMPLING/LINE combination this macro expands to nothing.
;
%macro INIT_UV 3

%if SUBSAMPLING == 1 && LINE == 1
%if %3 == 1 || %3 == 2
  movzx     %1, BYTE [%2 + WIDTHq]
%elif %3 == 4
  movzx     %1, WORD [%2 + WIDTHq]
%else
%error unsupported number of pixels.
%endif
%endif

%endmacro

;
; CALC_UV %1 (xmm), %2 (xmm), %3 (xmm), %4 (r32)
; Calculates two U (or V) values from four ARGB pixels stored in %2.
;   S(i) = 112 * B(i) - 74 * G(i) - 38 * R(i)    if %3 == XMM_CONST_U
;   S(i) = -18 * B(i) - 94 * G(i) + 112 * R(i)   if %3 == XMM_CONST_V
;   if (SUBSAMPLING) {
;     %1.b[0] = ToByte(((S(0) + S(1) + 1) / 2 + 128) / 256 + 128);
;     %1.b[1] = ToByte(((S(2) + S(3) + 1) / 2 + 128) / 256 + 128);
;   } else {
;     %1.b[0] = ToByte((S(0) + 128) / 256 + 128);
;     %1.b[1] = ToByte((S(2) + 128) / 256 + 128);
;   }
;   if (SUBSAMPLING && LINE == 1) {
;     %1.b[0] = (%1.b[0] + %4.b[0] + 1) / 2;
;     %1.b[1] = (%1.b[1] + %4.b[1] + 1) / 2;
;   }
; Clobbers: xmm2 (only when SUBSAMPLING == 1).
;
%macro CALC_UV 4
  ; for (int i = 0; i < 4; ++i) {
  ;   %1.w[i] = 0;
  ;   for (int j = 0; j < 4; ++j)
  ;     %1.w[i] += %3.b[i * 4 + j] * %2.b[i * 4 + j];
  ; }
  movdqa    %1, %2
  pmaddubsw %1, %3
  phaddsw   %1, %1

%if SUBSAMPLING == 1
  ; %1.w[0] = (%1.w[0] + %1.w[1] + 1) / 2;
  ; %1.w[1] = (%1.w[1] + %1.w[0] + 1) / 2;
  ; %1.w[2] = (%1.w[2] + %1.w[3] + 1) / 2;
  ; %1.w[3] = (%1.w[3] + %1.w[2] + 1) / 2;
  ; NOTE(review): pavgw is an unsigned average, while the phaddsw sums above
  ; are signed; lanes of opposite sign look like they would average
  ; incorrectly -- confirm before relying on this path.
  pshuflw   xmm2, %1, 10110001B
  pavgw     %1, xmm2
%endif

  ; %1.b[0] = ToByte((%1.w[0] + 128) / 256 + 128);
  ; %1.b[1] = ToByte((%1.w[2] + 128) / 256 + 128);
  pshuflw   %1, %1, 10001000B
  paddw     %1, XMM_CONST_128
  psraw     %1, 8
  paddw     %1, XMM_CONST_128
  packuswb  %1, %1

%if SUBSAMPLING == 1 && LINE == 1
  ; %1.b[0] = (%1.b[0] + %4.b[0] + 1) / 2;
  ; %1.b[1] = (%1.b[1] + %4.b[1] + 1) / 2;
  movd      xmm2, %4
  pavgb     %1, xmm2
%endif
%endmacro
;
; Each group below sets four preprocessor symbols and then includes the same
; template file once. As used by the macros in this file:
;   PIXELSIZE   - 4 selects the ARGB read path of READ_ARGB, 3 the RGB path.
;   SUBSAMPLING - 1 makes CALC_UV average horizontally adjacent pixels.
;   LINE        - with SUBSAMPLING == 1, 1 makes INIT_UV/CALC_UV average the
;                 new U/V values with the ones already stored in the output.
;   SYMBOL      - presumably the name of the function the template emits;
;                 confirm against convert_rgb_to_yuv_ssse3.inc.
;

;
; extern "C" void ConvertARGBToYUVRow_SSSE3(const uint8* argb,
;                                           uint8* y,
;                                           uint8* u,
;                                           uint8* v,
;                                           ptrdiff_t width);
;
%define SYMBOL ConvertARGBToYUVRow_SSSE3
%define PIXELSIZE 4
%define SUBSAMPLING 0
%define LINE 0
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertRGBToYUVRow_SSSE3(const uint8* rgb,
;                                          uint8* y,
;                                          uint8* u,
;                                          uint8* v,
;                                          ptrdiff_t width);
;
%define SYMBOL ConvertRGBToYUVRow_SSSE3
%define PIXELSIZE 3
%define SUBSAMPLING 0
%define LINE 0
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertARGBToYUVEven_SSSE3(const uint8* argb,
;                                            uint8* y,
;                                            uint8* u,
;                                            uint8* v,
;                                            ptrdiff_t width);
;
%define SYMBOL ConvertARGBToYUVEven_SSSE3
%define PIXELSIZE 4
%define SUBSAMPLING 1
%define LINE 0
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertARGBToYUVOdd_SSSE3(const uint8* argb,
;                                           uint8* y,
;                                           uint8* u,
;                                           uint8* v,
;                                           ptrdiff_t width);
;
%define SYMBOL ConvertARGBToYUVOdd_SSSE3
%define PIXELSIZE 4
%define SUBSAMPLING 1
%define LINE 1
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertRGBToYUVEven_SSSE3(const uint8* rgb,
;                                           uint8* y,
;                                           uint8* u,
;                                           uint8* v,
;                                           ptrdiff_t width);
;
%define SYMBOL ConvertRGBToYUVEven_SSSE3
%define PIXELSIZE 3
%define SUBSAMPLING 1
%define LINE 0
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertRGBToYUVOdd_SSSE3(const uint8* rgb,
;                                          uint8* y,
;                                          uint8* u,
;                                          uint8* v,
;                                          ptrdiff_t width);
;
%define SYMBOL ConvertRGBToYUVOdd_SSSE3
%define PIXELSIZE 3
%define SUBSAMPLING 1
%define LINE 1
%include "convert_rgb_to_yuv_ssse3.inc"