;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
     10 
     11 %include "third_party/x86inc/x86inc.asm"
     12 
     13 SECTION_RODATA
     14 pw_4:  times 8 dw 4
     15 pw_8:  times 8 dw 8
     16 pw_16: times 8 dw 16
     17 pw_32: times 8 dw 32
     18 
     19 SECTION .text
     20 
     21 INIT_MMX sse
     22 cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
     23   GET_GOT     goffsetq
     24 
     25   pxor                  m1, m1
     26   movd                  m0, [aboveq]
     27   punpckldq             m0, [leftq]
     28   psadbw                m0, m1
     29   paddw                 m0, [GLOBAL(pw_4)]
     30   psraw                 m0, 3
     31   pshufw                m0, m0, 0x0
     32   packuswb              m0, m0
     33   movd      [dstq        ], m0
     34   movd      [dstq+strideq], m0
     35   lea                 dstq, [dstq+strideq*2]
     36   movd      [dstq        ], m0
     37   movd      [dstq+strideq], m0
     38 
     39   RESTORE_GOT
     40   RET
     41 
     42 INIT_MMX sse
     43 cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
     44   GET_GOT     goffsetq
     45 
     46   pxor                  m1, m1
     47   movq                  m0, [aboveq]
     48   movq                  m2, [leftq]
     49   DEFINE_ARGS dst, stride, stride3
     50   lea             stride3q, [strideq*3]
     51   psadbw                m0, m1
     52   psadbw                m2, m1
     53   paddw                 m0, m2
     54   paddw                 m0, [GLOBAL(pw_8)]
     55   psraw                 m0, 4
     56   pshufw                m0, m0, 0x0
     57   packuswb              m0, m0
     58   movq    [dstq          ], m0
     59   movq    [dstq+strideq  ], m0
     60   movq    [dstq+strideq*2], m0
     61   movq    [dstq+stride3q ], m0
     62   lea                 dstq, [dstq+strideq*4]
     63   movq    [dstq          ], m0
     64   movq    [dstq+strideq  ], m0
     65   movq    [dstq+strideq*2], m0
     66   movq    [dstq+stride3q ], m0
     67 
     68   RESTORE_GOT
     69   RET
     70 
     71 INIT_XMM sse2
     72 cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
     73   GET_GOT     goffsetq
     74 
     75   pxor                  m1, m1
     76   mova                  m0, [aboveq]
     77   mova                  m2, [leftq]
     78   DEFINE_ARGS dst, stride, stride3, lines4
     79   lea             stride3q, [strideq*3]
     80   mov              lines4d, 4
     81   psadbw                m0, m1
     82   psadbw                m2, m1
     83   paddw                 m0, m2
     84   movhlps               m2, m0
     85   paddw                 m0, m2
     86   paddw                 m0, [GLOBAL(pw_16)]
     87   psraw                 m0, 5
     88   pshuflw               m0, m0, 0x0
     89   punpcklqdq            m0, m0
     90   packuswb              m0, m0
     91 .loop:
     92   mova    [dstq          ], m0
     93   mova    [dstq+strideq  ], m0
     94   mova    [dstq+strideq*2], m0
     95   mova    [dstq+stride3q ], m0
     96   lea                 dstq, [dstq+strideq*4]
     97   dec              lines4d
     98   jnz .loop
     99 
    100   RESTORE_GOT
    101   REP_RET
    102 
    103 INIT_XMM sse2
    104 cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
    105   GET_GOT     goffsetq
    106 
    107   pxor                  m1, m1
    108   mova                  m0, [aboveq]
    109   mova                  m2, [aboveq+16]
    110   mova                  m3, [leftq]
    111   mova                  m4, [leftq+16]
    112   DEFINE_ARGS dst, stride, stride3, lines4
    113   lea             stride3q, [strideq*3]
    114   mov              lines4d, 8
    115   psadbw                m0, m1
    116   psadbw                m2, m1
    117   psadbw                m3, m1
    118   psadbw                m4, m1
    119   paddw                 m0, m2
    120   paddw                 m0, m3
    121   paddw                 m0, m4
    122   movhlps               m2, m0
    123   paddw                 m0, m2
    124   paddw                 m0, [GLOBAL(pw_32)]
    125   psraw                 m0, 6
    126   pshuflw               m0, m0, 0x0
    127   punpcklqdq            m0, m0
    128   packuswb              m0, m0
    129 .loop:
    130   mova [dstq             ], m0
    131   mova [dstq          +16], m0
    132   mova [dstq+strideq     ], m0
    133   mova [dstq+strideq  +16], m0
    134   mova [dstq+strideq*2   ], m0
    135   mova [dstq+strideq*2+16], m0
    136   mova [dstq+stride3q    ], m0
    137   mova [dstq+stride3q +16], m0
    138   lea                 dstq, [dstq+strideq*4]
    139   dec              lines4d
    140   jnz .loop
    141 
    142   RESTORE_GOT
    143   REP_RET
    144 
    145 INIT_MMX sse
    146 cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
    147   movd                  m0, [aboveq]
    148   movd      [dstq        ], m0
    149   movd      [dstq+strideq], m0
    150   lea                 dstq, [dstq+strideq*2]
    151   movd      [dstq        ], m0
    152   movd      [dstq+strideq], m0
    153   RET
    154 
    155 INIT_MMX sse
    156 cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
    157   movq                  m0, [aboveq]
    158   DEFINE_ARGS dst, stride, stride3
    159   lea             stride3q, [strideq*3]
    160   movq    [dstq          ], m0
    161   movq    [dstq+strideq  ], m0
    162   movq    [dstq+strideq*2], m0
    163   movq    [dstq+stride3q ], m0
    164   lea                 dstq, [dstq+strideq*4]
    165   movq    [dstq          ], m0
    166   movq    [dstq+strideq  ], m0
    167   movq    [dstq+strideq*2], m0
    168   movq    [dstq+stride3q ], m0
    169   RET
    170 
    171 INIT_XMM sse2
    172 cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
    173   mova                  m0, [aboveq]
    174   DEFINE_ARGS dst, stride, stride3, nlines4
    175   lea             stride3q, [strideq*3]
    176   mov              nlines4d, 4
    177 .loop:
    178   mova    [dstq          ], m0
    179   mova    [dstq+strideq  ], m0
    180   mova    [dstq+strideq*2], m0
    181   mova    [dstq+stride3q ], m0
    182   lea                 dstq, [dstq+strideq*4]
    183   dec             nlines4d
    184   jnz .loop
    185   REP_RET
    186 
    187 INIT_XMM sse2
    188 cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
    189   mova                  m0, [aboveq]
    190   mova                  m1, [aboveq+16]
    191   DEFINE_ARGS dst, stride, stride3, nlines4
    192   lea             stride3q, [strideq*3]
    193   mov              nlines4d, 8
    194 .loop:
    195   mova [dstq             ], m0
    196   mova [dstq          +16], m1
    197   mova [dstq+strideq     ], m0
    198   mova [dstq+strideq  +16], m1
    199   mova [dstq+strideq*2   ], m0
    200   mova [dstq+strideq*2+16], m1
    201   mova [dstq+stride3q    ], m0
    202   mova [dstq+stride3q +16], m1
    203   lea                 dstq, [dstq+strideq*4]
    204   dec             nlines4d
    205   jnz .loop
    206   REP_RET
    207 
    208 INIT_MMX sse
    209 cglobal tm_predictor_4x4, 4, 4, 4, dst, stride, above, left
    210   pxor                  m1, m1
    211   movd                  m2, [aboveq-1]
    212   movd                  m0, [aboveq]
    213   punpcklbw             m2, m1
    214   punpcklbw             m0, m1
    215   pshufw                m2, m2, 0x0
    216   DEFINE_ARGS dst, stride, line, left
    217   mov                lineq, -2
    218   add                leftq, 4
    219   psubw                 m0, m2
    220 .loop:
    221   movd                  m2, [leftq+lineq*2]
    222   movd                  m3, [leftq+lineq*2+1]
    223   punpcklbw             m2, m1
    224   punpcklbw             m3, m1
    225   pshufw                m2, m2, 0x0
    226   pshufw                m3, m3, 0x0
    227   paddw                 m2, m0
    228   paddw                 m3, m0
    229   packuswb              m2, m2
    230   packuswb              m3, m3
    231   movd      [dstq        ], m2
    232   movd      [dstq+strideq], m3
    233   lea                 dstq, [dstq+strideq*2]
    234   inc                lineq
    235   jnz .loop
    236   REP_RET
    237 
    238 INIT_XMM sse2
    239 cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left
    240   pxor                  m1, m1
    241   movd                  m2, [aboveq-1]
    242   movq                  m0, [aboveq]
    243   punpcklbw             m2, m1
    244   punpcklbw             m0, m1
    245   pshuflw               m2, m2, 0x0
    246   DEFINE_ARGS dst, stride, line, left
    247   mov                lineq, -4
    248   punpcklqdq            m2, m2
    249   add                leftq, 8
    250   psubw                 m0, m2
    251 .loop:
    252   movd                  m2, [leftq+lineq*2]
    253   movd                  m3, [leftq+lineq*2+1]
    254   punpcklbw             m2, m1
    255   punpcklbw             m3, m1
    256   pshuflw               m2, m2, 0x0
    257   pshuflw               m3, m3, 0x0
    258   punpcklqdq            m2, m2
    259   punpcklqdq            m3, m3
    260   paddw                 m2, m0
    261   paddw                 m3, m0
    262   packuswb              m2, m3
    263   movq      [dstq        ], m2
    264   movhps    [dstq+strideq], m2
    265   lea                 dstq, [dstq+strideq*2]
    266   inc                lineq
    267   jnz .loop
    268   REP_RET
    269 
    270 INIT_XMM sse2
    271 cglobal tm_predictor_16x16, 4, 4, 7, dst, stride, above, left
    272   pxor                  m1, m1
    273   movd                  m2, [aboveq-1]
    274   mova                  m0, [aboveq]
    275   punpcklbw             m2, m1
    276   punpckhbw             m4, m0, m1
    277   punpcklbw             m0, m1
    278   pshuflw               m2, m2, 0x0
    279   DEFINE_ARGS dst, stride, line, left
    280   mov                lineq, -8
    281   punpcklqdq            m2, m2
    282   add                leftq, 16
    283   psubw                 m0, m2
    284   psubw                 m4, m2
    285 .loop:
    286   movd                  m2, [leftq+lineq*2]
    287   movd                  m3, [leftq+lineq*2+1]
    288   punpcklbw             m2, m1
    289   punpcklbw             m3, m1
    290   pshuflw               m2, m2, 0x0
    291   pshuflw               m3, m3, 0x0
    292   punpcklqdq            m2, m2
    293   punpcklqdq            m3, m3
    294   paddw                 m5, m2, m0
    295   paddw                 m6, m3, m0
    296   paddw                 m2, m4
    297   paddw                 m3, m4
    298   packuswb              m5, m2
    299   packuswb              m6, m3
    300   mova      [dstq        ], m5
    301   mova      [dstq+strideq], m6
    302   lea                 dstq, [dstq+strideq*2]
    303   inc                lineq
    304   jnz .loop
    305   REP_RET
    306 
    307 %if ARCH_X86_64
    308 INIT_XMM sse2
    309 cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left
    310   pxor                  m1, m1
    311   movd                  m2, [aboveq-1]
    312   mova                  m0, [aboveq]
    313   mova                  m4, [aboveq+16]
    314   punpcklbw             m2, m1
    315   punpckhbw             m3, m0, m1
    316   punpckhbw             m5, m4, m1
    317   punpcklbw             m0, m1
    318   punpcklbw             m4, m1
    319   pshuflw               m2, m2, 0x0
    320   DEFINE_ARGS dst, stride, line, left
    321   mov                lineq, -16
    322   punpcklqdq            m2, m2
    323   add                leftq, 32
    324   psubw                 m0, m2
    325   psubw                 m3, m2
    326   psubw                 m4, m2
    327   psubw                 m5, m2
    328 .loop:
    329   movd                  m2, [leftq+lineq*2]
    330   movd                  m6, [leftq+lineq*2+1]
    331   punpcklbw             m2, m1
    332   punpcklbw             m6, m1
    333   pshuflw               m2, m2, 0x0
    334   pshuflw               m6, m6, 0x0
    335   punpcklqdq            m2, m2
    336   punpcklqdq            m6, m6
    337   paddw                 m7, m2, m0
    338   paddw                 m8, m2, m3
    339   paddw                 m9, m2, m4
    340   paddw                 m2, m5
    341   packuswb              m7, m8
    342   packuswb              m9, m2
    343   paddw                 m2, m6, m0
    344   paddw                 m8, m6, m3
    345   mova   [dstq           ], m7
    346   paddw                 m7, m6, m4
    347   paddw                 m6, m5
    348   mova   [dstq        +16], m9
    349   packuswb              m2, m8
    350   packuswb              m7, m6
    351   mova   [dstq+strideq   ], m2
    352   mova   [dstq+strideq+16], m7
    353   lea                 dstq, [dstq+strideq*2]
    354   inc                lineq
    355   jnz .loop
    356   REP_RET
    357 %endif
    358