Home | History | Annotate | Download | only in simd
      1 ; Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 ; Use of this source code is governed by a BSD-style license that can be
      3 ; found in the LICENSE file.
      4 
      5 %include "media/base/simd/media_export.asm"
      6 %include "third_party/x86inc/x86inc.asm"
      7 
      8 ;
      9 ; This file uses SSE, SSE2, SSE3, and SSSE3, which are supported by all ATOM
     10 ; processors.
     11 ;
     12   SECTION_TEXT
     13   CPU       SSE, SSE3, SSE3, SSSE3
     14 
;
; XMM registers representing constants. We must not use these registers as
; destination operands.
; for (int i = 0; i < 16; i += 4) {
;   xmm7.b[i] = 25;  xmm7.b[i+1] = 2;   xmm7.b[i+2] = 66;  xmm7.b[i+3] = 0;
;   xmm6.b[i] = 0;   xmm6.b[i+1] = 127; xmm6.b[i+2] = 0;   xmm6.b[i+3] = 0;
;   xmm5.b[i] = 112; xmm5.b[i+1] = -74; xmm5.b[i+2] = -38; xmm5.b[i+3] = 0;
;   xmm4.b[i] = -18; xmm4.b[i+1] = -94; xmm4.b[i+2] = 112; xmm4.b[i+3] = 0;
; }
;
; The G coefficient of Y (129) is split across XMM_CONST_Y0 (2) and
; XMM_CONST_Y1 (127); CALC_Y adds the two partial products back together.
;
; XMM_CONST_128 is added by CALC_Y and CALC_UV as the "+ 128" term of their
; formulae, and CALC_Y shifts a copy of it right by 3 to obtain "+ 16".
; NOTE(review): this implies every 16-bit word of xmm3 holds 128; the code that
; loads these five registers is not in this file (presumably in the included
; .inc) -- confirm there.
;
%define XMM_CONST_Y0    xmm7
%define XMM_CONST_Y1    xmm6
%define XMM_CONST_U     xmm5
%define XMM_CONST_V     xmm4
%define XMM_CONST_128   xmm3
     30 
;
; LOAD_XMM %1 (xmm), %2 (imm32)
; Loads an immediate value to an XMM register, replicated into all four
; 32-bit lanes.
;   %1.d[0] = %1.d[1] =  %1.d[2] =  %1.d[3] = %2;
; Clobbers: TEMPd (used to stage the immediate).
;
%macro LOAD_XMM 2
  ; Stage the immediate in a general-purpose register.
  mov       TEMPd, %2
  ; %1.d[0] = %2; the upper lanes are zeroed by movd.
  movd      %1, TEMPd
  ; Copy lane 0 to every lane.
  pshufd    %1, %1, 00000000B
%endmacro
     41 
;
; UNPACKRGB %1 (xmm), %2 (imm8)
; Unpacks one RGB pixel in the specified XMM register by inserting a zero byte
; at byte index %2. Bytes at and above %2 move up by one position (the old
; byte 15 is discarded); bytes below %2 are unchanged.
;   for (int i = 15; i > %2; --i) %1.b[i] = %1.b[i - 1];
;   %1.b[%2] = 0;
; Clobbers: xmm1, so %1 must not be xmm1.
;
%macro UNPACKRGB 2
  ; xmm1 = %1 with its lowest %2 bytes cleared (shift down, then back up).
  movdqa    xmm1, %1
  psrldq    xmm1, %2
  pslldq    xmm1, %2
  ; Clear those same upper bytes in %1, keeping only bytes [0, %2).
  pxor      %1, xmm1
  ; Move the upper part up by one byte, which leaves byte %2 zero, and merge.
  pslldq    xmm1, 1
  por       %1, xmm1
%endmacro
     57 
     58 ;
     59 ; READ_ARGB %1 (xmm), %2 (imm)
     60 ; Read the specified number of ARGB (or RGB) pixels from the source and store
     61 ; them to the destination xmm register. If the input format is RGB, we read RGB
     62 ; pixels and convert them to ARGB pixels. (For this case, the alpha values of
     63 ; the output pixels become 0.)
     64 ;
     65 %macro READ_ARGB 2
     66 
     67 %if PIXELSIZE == 4
     68 
     69   ; Read ARGB pixels from the source. (This macro assumes the input buffer may
     70   ; not be aligned to a 16-byte boundary.)
     71 %if %2 == 1
     72   movd      %1, DWORD [ARGBq + WIDTHq * 4 * 2]
     73 %elif %2 == 2
     74   movq      %1, QWORD [ARGBq + WIDTHq * 4 * 2]
     75 %elif %2 == 4
     76   movdqu    %1, DQWORD [ARGBq + WIDTHq * 4 * 2]
     77 %else
     78 %error unsupported number of pixels.
     79 %endif
     80 
     81 %elif PIXELSIZE == 3
     82 
     83   ; Read RGB pixels from the source and convert them to ARGB pixels.
     84 %if %2 == 1
     85   ; Read one RGB pixel and convert it to one ARGB pixel.
     86   ; Save the WIDTH register to xmm1. (This macro needs to break it.)
     87   MOVq      xmm1, WIDTHq
     88 
     89   ; Once read three bytes from the source to TEMPd, and copy it to the
     90   ; destination xmm register.
     91   lea       WIDTHq, [WIDTHq + WIDTHq * 2]
     92   movzx     TEMPd, BYTE [ARGBq + WIDTHq * 2 + 2]
     93   shl       TEMPd, 16
     94   mov       TEMPw, WORD [ARGBq + WIDTHq * 2]
     95   movd      %1, TEMPd
     96 
     97   ; Restore the WIDTH register.
     98   MOVq      WIDTHq, xmm1
     99 %elif %2 == 2
    100   ; Read two RGB pixels and convert them to two ARGB pixels.
    101   ; Read six bytes from the source to the destination xmm register.
    102   mov       TEMPq, WIDTHq
    103   lea       TEMPq, [TEMPq + TEMPq * 2]
    104   movd      %1, DWORD [ARGBq + TEMPq * 2]
    105   pinsrw    %1, WORD [ARGBq + TEMPq * 2 + 4], 3
    106 
    107   ; Fill the alpha values of these RGB pixels with 0 and convert them to two
    108   ; ARGB pixels.
    109   UNPACKRGB %1, 3
    110 %elif %2 == 4
    111   ; Read four RGB pixels and convert them to four ARGB pixels.
    112   ; Read twelve bytes from the source to the destination xmm register.
    113   mov       TEMPq, WIDTHq
    114   lea       TEMPq, [TEMPq + TEMPq * 2]
    115   movq      %1, QWORD [ARGBq + TEMPq * 2]
    116   movd      xmm1, DWORD [ARGBq + TEMPq * 2 + 8]
    117   shufps    %1, xmm1, 01000100B
    118 
    119   ; Fill the alpha values of these RGB pixels with 0 and convert them to four
    120   ; ARGB pixels.
    121   UNPACKRGB %1, 3
    122   UNPACKRGB %1, 4 + 3
    123   UNPACKRGB %1, 4 + 4 + 3
    124 %else
    125 %error unsupported number of pixels.
    126 %endif
    127 
    128 %else
    129 %error unsupported PIXELSIZE value.
    130 %endif
    131 
    132 %endmacro
    133 
;
; CALC_Y %1 (xmm), %2 (xmm)
; Calculates four Y values from four ARGB pixels stored in %2.
;   %1.b[0] = ToByte((25 * B(0) + 129 * G(0) + 66 * R(0) + 128) / 256 + 16);
;   %1.b[1] = ToByte((25 * B(1) + 129 * G(1) + 66 * R(1) + 128) / 256 + 16);
;   %1.b[2] = ToByte((25 * B(2) + 129 * G(2) + 66 * R(2) + 128) / 256 + 16);
;   %1.b[3] = ToByte((25 * B(3) + 129 * G(3) + 66 * R(3) + 128) / 256 + 16);
; Clobbers: xmm2.
; %1 is written before %2 is read for the second time, so %1 and %2 must be
; different registers.
;
%macro CALC_Y 2
  ; To avoid signed saturation, we divide this conversion formula into two
  ; formulae and store their results into two XMM registers %1 and xmm2.
  ; (pmaddubsw multiplies bytes and adds them in pairs; phaddsw then adds the
  ; two pair sums of each pixel, giving the four-term sums below.)
  ; %1.w[0]   = 25  * %2.b[0]  + 2   * %2.b[1]  + 66  * %2.b[2]  + 0 * %2.b[3];
  ; %1.w[1]   = 25  * %2.b[4]  + 2   * %2.b[5]  + 66  * %2.b[6]  + 0 * %2.b[7];
  ; %1.w[2]   = 25  * %2.b[8]  + 2   * %2.b[9]  + 66  * %2.b[10] + 0 * %2.b[11];
  ; %1.w[3]   = 25  * %2.b[12] + 2   * %2.b[13] + 66  * %2.b[14] + 0 * %2.b[15];
  ; xmm2.w[0] = 0   * %2.b[0]  + 127 * %2.b[1]  + 0   * %2.b[2]  + 0 * %2.b[3];
  ; xmm2.w[1] = 0   * %2.b[4]  + 127 * %2.b[5]  + 0   * %2.b[6]  + 0 * %2.b[7];
  ; xmm2.w[2] = 0   * %2.b[8]  + 127 * %2.b[9]  + 0   * %2.b[10] + 0 * %2.b[11];
  ; xmm2.w[3] = 0   * %2.b[12] + 127 * %2.b[13] + 0   * %2.b[14] + 0 * %2.b[15];
  movdqa    %1, %2
  pmaddubsw %1, XMM_CONST_Y0
  phaddsw   %1, %1
  movdqa    xmm2, %2
  pmaddubsw xmm2, XMM_CONST_Y1
  phaddsw   xmm2, xmm2

  ; %1.b[0] = ToByte((%1.w[0] + xmm2.w[0] + 128) / 256 + 16);
  ; %1.b[1] = ToByte((%1.w[1] + xmm2.w[1] + 128) / 256 + 16);
  ; %1.b[2] = ToByte((%1.w[2] + xmm2.w[2] + 128) / 256 + 16);
  ; %1.b[3] = ToByte((%1.w[3] + xmm2.w[3] + 128) / 256 + 16);
  ; The constant is copied to xmm2 so that the "+ 16" term can be derived from
  ; it (128 >> 3) without writing to XMM_CONST_128. The combined sum is shifted
  ; with a logical (unsigned) shift; packuswb performs the ToByte saturation.
  paddw     %1, xmm2
  movdqa    xmm2, XMM_CONST_128
  paddw     %1, xmm2
  psrlw     %1, 8
  psrlw     xmm2, 3
  paddw     %1, xmm2
  packuswb  %1, %1
%endmacro
    172 
    173 ;
    174 ; INIT_UV %1 (r32), %2 (reg) %3 (imm)
    175 ;
    176 %macro INIT_UV 3
    177 
    178 %if SUBSAMPLING == 1 && LINE == 1
    179 %if %3 == 1 || %3 == 2
    180   movzx     %1, BYTE [%2 + WIDTHq]
    181 %elif %3 == 4
    182   movzx     %1, WORD [%2 + WIDTHq]
    183 %else
    184 %error unsupported number of pixels.
    185 %endif
    186 %endif
    187 
    188 %endmacro
    189 
;
; CALC_UV %1 (xmm), %2 (xmm), %3 (xmm), %4 (r32)
; Calculates two U (or V) values from four ARGB pixels stored in %2.
; %3 supplies the coefficients (XMM_CONST_U or XMM_CONST_V). Writing
;   S(n) = 112 * B(n) - 74 * G(n) - 38 * R(n)    if %3 == XMM_CONST_U
;   S(n) = -18 * B(n) - 94 * G(n) + 112 * R(n)   if %3 == XMM_CONST_V
; the results are:
; if (SUBSAMPLING) {
;   %1.b[0] = ToByte(((S(0) + S(1) + 1) / 2 + 128) / 256 + 128);
;   %1.b[1] = ToByte(((S(2) + S(3) + 1) / 2 + 128) / 256 + 128);
; } else {
;   %1.b[0] = ToByte((S(0) + 128) / 256 + 128);
;   %1.b[1] = ToByte((S(2) + 128) / 256 + 128);
; }
; if (SUBSAMPLING && LINE) {
;   %1.b[0] = (%1.b[0] + %4.b[0] + 1) / 2;
;   %1.b[1] = (%1.b[1] + %4.b[1] + 1) / 2;
; }
; %4 is read only when SUBSAMPLING == 1 && LINE == 1.
; Clobbers: xmm2 (only when SUBSAMPLING == 1).
;
%macro CALC_UV 4
  ; for (int i = 0; i < 4; ++i) {
  ;   %1.w[i] = 0;
  ;   for (int j = 0; j < 4; ++j)
  ;     %1.w[i] += %3.b[i * 4 + j] * %2.b[i * 4 + j];
  ; }
  movdqa    %1, %2
  pmaddubsw %1, %3
  phaddsw   %1, %1

%if SUBSAMPLING == 1
  ; %1.w[0] = (%1.w[0] + %1.w[1] + 1) / 2;
  ; %1.w[1] = (%1.w[1] + %1.w[0] + 1) / 2;
  ; %1.w[2] = (%1.w[2] + %1.w[3] + 1) / 2;
  ; %1.w[3] = (%1.w[3] + %1.w[2] + 1) / 2;
  ; NOTE(review): pavgw is an unsigned average, while the sums in %1 are signed
  ; (the U/V coefficients include negative values). A pair of sums with
  ; opposite signs looks like it would average incorrectly -- confirm.
  pshuflw   xmm2, %1, 10110001B
  pavgw     %1, xmm2
%endif

  ; %1.b[0] = ToByte((%1.w[0] + 128) / 256 + 128);
  ; %1.b[1] = ToByte((%1.w[2] + 128) / 256 + 128);
  ; (pshuflw gathers words 0 and 2 into adjacent lanes; psraw keeps the sign
  ; of the sum; packuswb performs the ToByte saturation.)
  pshuflw   %1, %1, 10001000B
  paddw     %1, XMM_CONST_128
  psraw     %1, 8
  paddw     %1, XMM_CONST_128
  packuswb  %1, %1

%if SUBSAMPLING == 1 && LINE == 1
  ; %1.b[0] = (%1.b[0] + %4.b[0] + 1) / 2;
  ; %1.b[1] = (%1.b[1] + %4.b[1] + 1) / 2;
  movd      xmm2, %4
  pavgb     %1, xmm2
%endif
%endmacro
    241 
;
; extern "C" void ConvertARGBToYUVRow_SSSE3(const uint8* argb,
;                                           uint8* y,
;                                           uint8* u,
;                                           uint8* v,
;                                           ptrdiff_t width);
;
; Settings for this instantiation of the included template:
;   PIXELSIZE   4: READ_ARGB loads 4-byte pixels as they are.
;   SUBSAMPLING 0: CALC_UV does not average neighbouring pixels.
;   LINE        0: INIT_UV expands to nothing; CALC_UV ignores its %4 argument.
;
%define SYMBOL          ConvertARGBToYUVRow_SSSE3
%define PIXELSIZE       4
%define SUBSAMPLING     0
%define LINE            0
%include "convert_rgb_to_yuv_ssse3.inc"
    254 
;
; extern "C" void ConvertRGBToYUVRow_SSSE3(const uint8* rgb,
;                                          uint8* y,
;                                          uint8* u,
;                                          uint8* v,
;                                          ptrdiff_t width);
;
; Settings for this instantiation of the included template:
;   PIXELSIZE   3: READ_ARGB loads 3-byte pixels and widens them to 4 bytes
;                  with a zero alpha byte.
;   SUBSAMPLING 0: CALC_UV does not average neighbouring pixels.
;   LINE        0: INIT_UV expands to nothing; CALC_UV ignores its %4 argument.
;
%define SYMBOL          ConvertRGBToYUVRow_SSSE3
%define PIXELSIZE       3
%define SUBSAMPLING     0
%define LINE            0
%include "convert_rgb_to_yuv_ssse3.inc"
    267 
;
; extern "C" void ConvertARGBToYUVEven_SSSE3(const uint8* argb,
;                                            uint8* y,
;                                            uint8* u,
;                                            uint8* v,
;                                            ptrdiff_t width);
;
; Settings for this instantiation of the included template:
;   PIXELSIZE   4: READ_ARGB loads 4-byte pixels as they are.
;   SUBSAMPLING 1: CALC_UV averages each pair of horizontally adjacent pixels.
;   LINE        0: INIT_UV expands to nothing; CALC_UV ignores its %4 argument.
;
%define SYMBOL          ConvertARGBToYUVEven_SSSE3
%define PIXELSIZE       4
%define SUBSAMPLING     1
%define LINE            0
%include "convert_rgb_to_yuv_ssse3.inc"
    280 
;
; extern "C" void ConvertARGBToYUVOdd_SSSE3(const uint8* argb,
;                                           uint8* y,
;                                           uint8* u,
;                                           uint8* v,
;                                           ptrdiff_t width);
;
; Settings for this instantiation of the included template:
;   PIXELSIZE   4: READ_ARGB loads 4-byte pixels as they are.
;   SUBSAMPLING 1: CALC_UV averages each pair of horizontally adjacent pixels.
;   LINE        1: INIT_UV loads existing bytes, and CALC_UV averages its
;                  result with the bytes passed in its %4 argument.
;
%define SYMBOL          ConvertARGBToYUVOdd_SSSE3
%define PIXELSIZE       4
%define SUBSAMPLING     1
%define LINE            1
%include "convert_rgb_to_yuv_ssse3.inc"
    293 
;
; extern "C" void ConvertRGBToYUVEven_SSSE3(const uint8* rgb,
;                                           uint8* y,
;                                           uint8* u,
;                                           uint8* v,
;                                           ptrdiff_t width);
;
; Settings for this instantiation of the included template:
;   PIXELSIZE   3: READ_ARGB loads 3-byte pixels and widens them to 4 bytes
;                  with a zero alpha byte.
;   SUBSAMPLING 1: CALC_UV averages each pair of horizontally adjacent pixels.
;   LINE        0: INIT_UV expands to nothing; CALC_UV ignores its %4 argument.
;
%define SYMBOL          ConvertRGBToYUVEven_SSSE3
%define PIXELSIZE       3
%define SUBSAMPLING     1
%define LINE            0
%include "convert_rgb_to_yuv_ssse3.inc"
    306 
;
; extern "C" void ConvertRGBToYUVOdd_SSSE3(const uint8* rgb,
;                                          uint8* y,
;                                          uint8* u,
;                                          uint8* v,
;                                          ptrdiff_t width);
;
; Settings for this instantiation of the included template:
;   PIXELSIZE   3: READ_ARGB loads 3-byte pixels and widens them to 4 bytes
;                  with a zero alpha byte.
;   SUBSAMPLING 1: CALC_UV averages each pair of horizontally adjacent pixels.
;   LINE        1: INIT_UV loads existing bytes, and CALC_UV averages its
;                  result with the bytes passed in its %4 argument.
;
%define SYMBOL          ConvertRGBToYUVOdd_SSSE3
%define PIXELSIZE       3
%define SUBSAMPLING     1
%define LINE            1
%include "convert_rgb_to_yuv_ssse3.inc"
    319