Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 %include "third_party/x86inc/x86inc.asm"
     12 
     13 SECTION_RODATA
        ; Read-only constants shared by the predictors in this file.
        ; pw_N  : rounding term added before the shift in dc_predictor_NxN.
        ; pw2_N : rounding term added before the shift in dc_top/dc_left_predictor_NxN.
     14 pb_1: times 16 db 1           ; per-byte LSB mask used by X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2
     15 pw_4:  times 8 dw 4           ; dc_predictor_4x4:   (sum + 4)  >> 3
     16 pw_8:  times 8 dw 8           ; dc_predictor_8x8:   (sum + 8)  >> 4
     17 pw_16: times 8 dw 16          ; dc_predictor_16x16: (sum + 16) >> 5
     18 pw_32: times 8 dw 32          ; dc_predictor_32x32: (sum + 32) >> 6
     19 dc_128: times 16 db 128       ; fill value stored by the dc_128 predictors
     20 pw2_4:  times 8 dw 2          ; dc_top/left 4x4:    (sum + 2)  >> 2
     21 pw2_8:  times 8 dw 4          ; dc_top/left 8x8:    (sum + 4)  >> 3
     22 pw2_16:  times 8 dw 8         ; dc_top/left 16x16:  (sum + 8)  >> 4
     23 pw2_32:  times 8 dw 16        ; dc_top/left 32x32:  (sum + 16) >> 5
     24 
     25 SECTION .text
     26 
     27 ; ------------------------------------------
     28 ; input: x, y, z, result   (macro parameters %1, %2, %3, %4)
     29 ;
     30 ; trick from pascal
     31 ; (x+2y+z+2)>>2 can be calculated, per byte, as:
     32 ; result = avg(x,z)            ; pavgb rounds up: (x+z+1)>>1
     33 ; result -= xor(x,z) & 1       ; undo the round-up, giving (x+z)>>1
     34 ; result = avg(result,y)
        ;
        ; Side effects: %3 (z) is overwritten with (x ^ z) & 1; %1 and %2 are
        ; left untouched. Reads the pb_1 constant through GLOBAL(), so callers
        ; that need a GOT register must have set it up first.
     35 ; ------------------------------------------
     36 %macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
     37   pavgb               %4, %1, %3
     38   pxor                %3, %1
     39   pand                %3, [GLOBAL(pb_1)]
     40   psubb               %4, %3
     41   pavgb               %4, %2
     42 %endmacro
     43 
     44 INIT_XMM sse2
        ; d45_predictor_4x4(dst, stride, above)
        ; Loads 8 bytes from above, applies the (x+2y+z+2)>>2 filter to each
        ; triple of neighbouring bytes, and stores four 4-byte rows, each row
        ; starting one filtered byte later than the previous one.
        ; goffset is only a scratch register for GET_GOT/RESTORE_GOT.
     45 cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
     46   GET_GOT     goffsetq
     47 
     48   movq                 m0, [aboveq]
     49   DEFINE_ARGS dst, stride, temp
     50   psrldq               m1, m0, 1           ; above shifted by one byte  (y)
     51   psrldq               m2, m0, 2           ; above shifted by two bytes (z)
     52   X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
     53 
     54   ; store 4 lines
     55   movd   [dstq          ], m3
     56   psrlq                m3, 8
     57   movd   [dstq+strideq  ], m3
     58   lea                dstq, [dstq+strideq*2]
     59   psrlq                m3, 8
     60   movd   [dstq          ], m3
     61   psrlq                m3, 8
     62   movd   [dstq+strideq  ], m3
        ; The last pixel of the last row was filtered with a zero shifted in by
        ; psrldq, so it is overwritten with the unfiltered byte 7 of above.
     63   psrlq                m0, 56
     64   movd              tempd, m0
     65   mov    [dstq+strideq+3], tempb
     66 
     67   RESTORE_GOT
     68   RET
     69 
     70 INIT_XMM sse2
        ; d45_predictor_8x8(dst, stride, above)
        ; Loads 16 bytes from above (unaligned), filters them with
        ; (x+2y+z+2)>>2, pads the upper half of the result with copies of one
        ; source byte, and stores eight 8-byte rows, shifting the vector right
        ; by one byte before every store.
     71 cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
     72   GET_GOT     goffsetq
     73 
     74   movu                m1, [aboveq]
     75   pslldq              m0, m1, 1            ; previous-neighbour vector (x)
     76   psrldq              m2, m1, 1            ; next-neighbour vector     (z)
     77   DEFINE_ARGS dst, stride, stride3
     78   lea           stride3q, [strideq*3]
     79   X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
        ; m0 is not modified by the macro; broadcast its byte 8 (= above[7])
        ; across the low 8 bytes to form the padding.
     80   punpckhbw           m0, m0 ; 7 7
     81   punpcklwd           m0, m0 ; 7 7 7 7
     82   punpckldq           m0, m0 ; 7 7 7 7 7 7 7 7
     83   punpcklqdq          m3, m0 ; -1 0 1 2 3 4 5 6 7 7 7 7 7 7 7 7
     84 
     85  ; store 4 lines
     86   psrldq                m3, 1
     87   movq    [dstq          ], m3
     88   psrldq                m3, 1
     89   movq    [dstq+strideq  ], m3
     90   psrldq                m3, 1
     91   movq    [dstq+strideq*2], m3
     92   psrldq                m3, 1
     93   movq    [dstq+stride3q ], m3
     94   lea                 dstq, [dstq+strideq*4]
     95 
     96   ; store next 4 lines
     97   psrldq                m3, 1
     98   movq    [dstq          ], m3
     99   psrldq                m3, 1
    100   movq    [dstq+strideq  ], m3
    101   psrldq                m3, 1
    102   movq    [dstq+strideq*2], m3
    103   psrldq                m3, 1
    104   movq    [dstq+stride3q ], m3
    105 
    106   RESTORE_GOT
    107   RET
    108 
    109 INIT_XMM sse2
        ; d207_predictor_4x4(dst, stride, unused, left)
        ; Builds the 4x4 block from the four left-column bytes a, b, c, d only.
        ; 2-tap averages and 3-tap (x+2y+z+2)>>2 values are interleaved, each
        ; row starts two bytes further into that sequence, and the last row is
        ; d repeated four times. The third argument is never read.
    110 cglobal d207_predictor_4x4, 4, 4, 5, dst, stride, unused, left, goffset
    111   GET_GOT     goffsetq
    112 
    113   movd                m0, [leftq]                ; abcd [byte]
    114   punpcklbw           m4, m0, m0                 ; aabb ccdd
    115   punpcklwd           m4, m4                     ; aaaa bbbb cccc dddd
    116   psrldq              m4, 12                     ; dddd
    117   punpckldq           m0, m4                     ; abcd dddd
    118   psrldq              m1, m0, 1                  ; bcdd
    119   psrldq              m2, m0, 2                  ; cddd
    120 
    121   X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3   ; a2bc b2cd c3d d
    122   pavgb               m1, m0                     ; ab, bc, cd, d [byte]
    123 
    124   punpcklbw           m1, m3             ; ab, a2bc, bc, b2cd, cd, c3d, d, d
    125   movd    [dstq        ], m1
    126   psrlq               m1, 16             ; bc, b2cd, cd, c3d, d, d
    127   movd    [dstq+strideq], m1
    128 
    129   lea               dstq, [dstq+strideq*2]
    130   psrlq               m1, 16             ; cd, c3d, d, d
    131   movd    [dstq        ], m1
    132   movd    [dstq+strideq], m4             ; d, d, d, d
    133   RESTORE_GOT
    134   RET
    135 
    136 INIT_XMM sse2
        ; dc_predictor_4x4(dst, stride, above, left)
        ; Fills the 4x4 block with (sum(above[0..3]) + sum(left[0..3]) + 4) >> 3.
    137 cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
    138   GET_GOT     goffsetq
    139 
    140   movd                  m2, [leftq]
    141   movd                  m0, [aboveq]
    142   pxor                  m1, m1
    143   punpckldq             m0, m2             ; above[0..3] | left[0..3] in the low 8 bytes
    144   psadbw                m0, m1             ; SAD against zero = sum of the 8 bytes
    145   paddw                 m0, [GLOBAL(pw_4)]
    146   psraw                 m0, 3
    147   pshuflw               m0, m0, 0x0        ; broadcast the DC word to 4 words
    148   packuswb              m0, m0             ; words -> bytes
    149   movd      [dstq        ], m0
    150   movd      [dstq+strideq], m0
    151   lea                 dstq, [dstq+strideq*2]
    152   movd      [dstq        ], m0
    153   movd      [dstq+strideq], m0
    154 
    155   RESTORE_GOT
    156   RET
    157 
    158 INIT_XMM sse2
        ; dc_left_predictor_4x4(dst, stride, above, left)
        ; Fills the 4x4 block with (sum(left[0..3]) + 2) >> 2; above is not read.
        ; Only two arguments are declared as auto-loaded, so left is fetched
        ; explicitly with movifnidn.
    159 cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
    160   movifnidn          leftq, leftmp
    161   GET_GOT     goffsetq
    162 
    163   pxor                  m1, m1
    164   movd                  m0, [leftq]
    165   psadbw                m0, m1             ; sum of the 4 left bytes
    166   paddw                 m0, [GLOBAL(pw2_4)]
    167   psraw                 m0, 2
    168   pshuflw               m0, m0, 0x0
    169   packuswb              m0, m0
    170   movd      [dstq        ], m0
    171   movd      [dstq+strideq], m0
    172   lea                 dstq, [dstq+strideq*2]
    173   movd      [dstq        ], m0
    174   movd      [dstq+strideq], m0
    175 
    176   RESTORE_GOT
    177   RET
    178 
    179 INIT_XMM sse2
        ; dc_top_predictor_4x4(dst, stride, above, left)
        ; Fills the 4x4 block with (sum(above[0..3]) + 2) >> 2; left is not read.
    180 cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
    181   GET_GOT     goffsetq
    182 
    183   pxor                  m1, m1
    184   movd                  m0, [aboveq]
    185   psadbw                m0, m1             ; sum of the 4 above bytes
    186   paddw                 m0, [GLOBAL(pw2_4)]
    187   psraw                 m0, 2
    188   pshuflw               m0, m0, 0x0
    189   packuswb              m0, m0
    190   movd      [dstq        ], m0
    191   movd      [dstq+strideq], m0
    192   lea                 dstq, [dstq+strideq*2]
    193   movd      [dstq        ], m0
    194   movd      [dstq+strideq], m0
    195 
    196   RESTORE_GOT
    197   RET
    198 
    199 INIT_XMM sse2
        ; dc_predictor_8x8(dst, stride, above, left)
        ; Fills the 8x8 block with (sum(above[0..7]) + sum(left[0..7]) + 8) >> 4.
    200 cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
    201   GET_GOT     goffsetq
    202 
    203   pxor                  m1, m1
    204   movq                  m0, [aboveq]
    205   movq                  m2, [leftq]
    206   DEFINE_ARGS dst, stride, stride3
    207   lea             stride3q, [strideq*3]
    208   psadbw                m0, m1             ; sum of above bytes
    209   psadbw                m2, m1             ; sum of left bytes
    210   paddw                 m0, m2
    211   paddw                 m0, [GLOBAL(pw_8)]
    212   psraw                 m0, 4
    213   punpcklbw             m0, m0             ; duplicate the DC byte into both halves of word 0
    214   pshuflw               m0, m0, 0x0        ; broadcast word 0 -> 8 identical bytes
    215   movq    [dstq          ], m0
    216   movq    [dstq+strideq  ], m0
    217   movq    [dstq+strideq*2], m0
    218   movq    [dstq+stride3q ], m0
    219   lea                 dstq, [dstq+strideq*4]
    220   movq    [dstq          ], m0
    221   movq    [dstq+strideq  ], m0
    222   movq    [dstq+strideq*2], m0
    223   movq    [dstq+stride3q ], m0
    224 
    225   RESTORE_GOT
    226   RET
    227 
    228 INIT_XMM sse2
        ; dc_top_predictor_8x8(dst, stride, above, left)
        ; Fills the 8x8 block with (sum(above[0..7]) + 4) >> 3; left is not read.
    229 cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
    230   GET_GOT     goffsetq
    231 
    232   pxor                  m1, m1
    233   movq                  m0, [aboveq]
    234   DEFINE_ARGS dst, stride, stride3
    235   lea             stride3q, [strideq*3]
    236   psadbw                m0, m1             ; sum of the 8 above bytes
    237   paddw                 m0, [GLOBAL(pw2_8)]
    238   psraw                 m0, 3
    239   punpcklbw             m0, m0             ; DC byte duplicated inside word 0
    240   pshuflw               m0, m0, 0x0        ; -> 8 identical bytes
    241   movq    [dstq          ], m0
    242   movq    [dstq+strideq  ], m0
    243   movq    [dstq+strideq*2], m0
    244   movq    [dstq+stride3q ], m0
    245   lea                 dstq, [dstq+strideq*4]
    246   movq    [dstq          ], m0
    247   movq    [dstq+strideq  ], m0
    248   movq    [dstq+strideq*2], m0
    249   movq    [dstq+stride3q ], m0
    250 
    251   RESTORE_GOT
    252   RET
    253 
    254 INIT_XMM sse2
        ; dc_left_predictor_8x8(dst, stride, above, left)
        ; Fills the 8x8 block with (sum(left[0..7]) + 4) >> 3; above is not read.
        ; Only two arguments are declared as auto-loaded, so left is fetched
        ; explicitly with movifnidn.
    255 cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
    256   movifnidn          leftq, leftmp
    257   GET_GOT     goffsetq
    258 
    259   pxor                  m1, m1
    260   movq                  m0, [leftq]
    261   DEFINE_ARGS dst, stride, stride3
    262   lea             stride3q, [strideq*3]
    263   psadbw                m0, m1             ; sum of the 8 left bytes
    264   paddw                 m0, [GLOBAL(pw2_8)]
    265   psraw                 m0, 3
    266   punpcklbw             m0, m0             ; DC byte duplicated inside word 0
    267   pshuflw               m0, m0, 0x0        ; -> 8 identical bytes
    268   movq    [dstq          ], m0
    269   movq    [dstq+strideq  ], m0
    270   movq    [dstq+strideq*2], m0
    271   movq    [dstq+stride3q ], m0
    272   lea                 dstq, [dstq+strideq*4]
    273   movq    [dstq          ], m0
    274   movq    [dstq+strideq  ], m0
    275   movq    [dstq+strideq*2], m0
    276   movq    [dstq+stride3q ], m0
    277 
    278   RESTORE_GOT
    279   RET
    280 
    281 INIT_XMM sse2
        ; dc_128_predictor_4x4(dst, stride, above, left)
        ; Fills the 4x4 block with the constant 128; above and left are not read
        ; (only dst and stride are declared as loaded arguments).
    282 cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
    283   GET_GOT     goffsetq
    284 
    285   DEFINE_ARGS dst, stride, stride3
    286   lea             stride3q, [strideq*3]
    287   movd     m0,        [GLOBAL(dc_128)]
    288   movd    [dstq          ], m0
    289   movd    [dstq+strideq  ], m0
    290   movd    [dstq+strideq*2], m0
    291   movd    [dstq+stride3q ], m0
    292   RESTORE_GOT
    293   RET
    294 
    295 INIT_XMM sse2
        ; dc_128_predictor_8x8(dst, stride, above, left)
        ; Fills the 8x8 block with the constant 128; above and left are not read
        ; (only dst and stride are declared as loaded arguments).
    296 cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
    297   GET_GOT     goffsetq
    298 
    299   DEFINE_ARGS dst, stride, stride3
    300   lea             stride3q, [strideq*3]
    301   movq    m0,        [GLOBAL(dc_128)]
    302   movq    [dstq          ], m0
    303   movq    [dstq+strideq  ], m0
    304   movq    [dstq+strideq*2], m0
    305   movq    [dstq+stride3q ], m0
    306   lea                 dstq, [dstq+strideq*4]
    307   movq    [dstq          ], m0
    308   movq    [dstq+strideq  ], m0
    309   movq    [dstq+strideq*2], m0
    310   movq    [dstq+stride3q ], m0
    311   RESTORE_GOT
    312   RET
    313 
    314 INIT_XMM sse2
        ; dc_predictor_16x16(dst, stride, above, left)
        ; Fills the 16x16 block with
        ; (sum(above[0..15]) + sum(left[0..15]) + 16) >> 5.
        ; above, left and every dst row are accessed with aligned 16-byte moves.
    315 cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
    316   GET_GOT     goffsetq
    317 
    318   pxor                  m1, m1
    319   mova                  m0, [aboveq]
    320   mova                  m2, [leftq]
    321   DEFINE_ARGS dst, stride, stride3, lines4
    322   lea             stride3q, [strideq*3]
    323   mov              lines4d, 4               ; 4 iterations x 4 rows
    324   psadbw                m0, m1             ; two partial sums, one per 64-bit half
    325   psadbw                m2, m1
    326   paddw                 m0, m2
    327   movhlps               m2, m0             ; bring the high-half sum down
    328   paddw                 m0, m2             ; word 0 = total of all 32 bytes
    329   paddw                 m0, [GLOBAL(pw_16)]
    330   psraw                 m0, 5
    331   pshuflw               m0, m0, 0x0
    332   punpcklqdq            m0, m0             ; DC in all 8 words
    333   packuswb              m0, m0             ; -> 16 identical bytes
    334 .loop:
    335   mova    [dstq          ], m0
    336   mova    [dstq+strideq  ], m0
    337   mova    [dstq+strideq*2], m0
    338   mova    [dstq+stride3q ], m0
    339   lea                 dstq, [dstq+strideq*4]
    340   dec              lines4d
    341   jnz .loop
    342 
    343   RESTORE_GOT
    344   REP_RET
    345 
    346 
    347 INIT_XMM sse2
        ; dc_top_predictor_16x16(dst, stride, above, left)
        ; Fills the 16x16 block with (sum(above[0..15]) + 8) >> 4.
        ; left is declared as a loaded argument but never read.
    348 cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
    349   GET_GOT     goffsetq
    350 
    351   pxor                  m1, m1
    352   mova                  m0, [aboveq]
    353   DEFINE_ARGS dst, stride, stride3, lines4
    354   lea             stride3q, [strideq*3]
    355   mov              lines4d, 4               ; 4 iterations x 4 rows
    356   psadbw                m0, m1             ; two partial sums, one per 64-bit half
    357   movhlps               m2, m0             ; only the low half of m2 is written/used
    358   paddw                 m0, m2             ; word 0 = total of the 16 bytes
    359   paddw                 m0, [GLOBAL(pw2_16)]
    360   psraw                 m0, 4
    361   pshuflw               m0, m0, 0x0
    362   punpcklqdq            m0, m0
    363   packuswb              m0, m0             ; -> 16 identical bytes
    364 .loop:
    365   mova    [dstq          ], m0
    366   mova    [dstq+strideq  ], m0
    367   mova    [dstq+strideq*2], m0
    368   mova    [dstq+stride3q ], m0
    369   lea                 dstq, [dstq+strideq*4]
    370   dec              lines4d
    371   jnz .loop
    372 
    373   RESTORE_GOT
    374   REP_RET
    375 
    376 INIT_XMM sse2
        ; dc_left_predictor_16x16(dst, stride, above, left)
        ; Fills the 16x16 block with (sum(left[0..15]) + 8) >> 4; above is not read.
    377 cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
    378   GET_GOT     goffsetq
    379 
    380   pxor                  m1, m1
    381   mova                  m0, [leftq]
    382   DEFINE_ARGS dst, stride, stride3, lines4
    383   lea             stride3q, [strideq*3]
    384   mov              lines4d, 4               ; 4 iterations x 4 rows
    385   psadbw                m0, m1             ; two partial sums, one per 64-bit half
    386   movhlps               m2, m0             ; only the low half of m2 is written/used
    387   paddw                 m0, m2             ; word 0 = total of the 16 bytes
    388   paddw                 m0, [GLOBAL(pw2_16)]
    389   psraw                 m0, 4
    390   pshuflw               m0, m0, 0x0
    391   punpcklqdq            m0, m0
    392   packuswb              m0, m0             ; -> 16 identical bytes
    393 .loop:
    394   mova    [dstq          ], m0
    395   mova    [dstq+strideq  ], m0
    396   mova    [dstq+strideq*2], m0
    397   mova    [dstq+stride3q ], m0
    398   lea                 dstq, [dstq+strideq*4]
    399   dec              lines4d
    400   jnz .loop
    401 
    402   RESTORE_GOT
    403   REP_RET
    404 
    405 INIT_XMM sse2
        ; dc_128_predictor_16x16(dst, stride, above, left)
        ; Fills the 16x16 block with the constant 128 using aligned 16-byte
        ; stores; above and left are not read.
        ; Only dst and stride are declared as loaded arguments and only one XMM
        ; register is declared, matching dc_128_predictor_4x4/8x8. The function
        ; ends with REP_RET like the other looping predictors in this file.
    406 cglobal dc_128_predictor_16x16, 2, 5, 1, dst, stride, above, left, goffset
    407   GET_GOT     goffsetq
    408 
    409   DEFINE_ARGS dst, stride, stride3, lines4
    410   lea             stride3q, [strideq*3]
    411   mov              lines4d, 4               ; 4 iterations x 4 rows
    412   mova    m0,        [GLOBAL(dc_128)]
    413 .loop:
    414   mova    [dstq          ], m0
    415   mova    [dstq+strideq  ], m0
    416   mova    [dstq+strideq*2], m0
    417   mova    [dstq+stride3q ], m0
    418   lea                 dstq, [dstq+strideq*4]
    419   dec              lines4d
    420   jnz .loop
    421   RESTORE_GOT
    422   REP_RET
    423 
    424 
    425 INIT_XMM sse2
        ; dc_predictor_32x32(dst, stride, above, left)
        ; Fills the 32x32 block with
        ; (sum(above[0..31]) + sum(left[0..31]) + 32) >> 6.
        ; above, left and every dst row are accessed with aligned 16-byte moves.
    426 cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
    427   GET_GOT     goffsetq
    428 
    429   pxor                  m1, m1
    430   mova                  m0, [aboveq]
    431   mova                  m2, [aboveq+16]
    432   mova                  m3, [leftq]
    433   mova                  m4, [leftq+16]
    434   DEFINE_ARGS dst, stride, stride3, lines4
    435   lea             stride3q, [strideq*3]
    436   mov              lines4d, 8               ; 8 iterations x 4 rows
    437   psadbw                m0, m1             ; each psadbw yields two partial sums
    438   psadbw                m2, m1
    439   psadbw                m3, m1
    440   psadbw                m4, m1
    441   paddw                 m0, m2
    442   paddw                 m0, m3
    443   paddw                 m0, m4
    444   movhlps               m2, m0             ; bring the high-half sum down
    445   paddw                 m0, m2             ; word 0 = total of all 64 bytes
    446   paddw                 m0, [GLOBAL(pw_32)]
    447   psraw                 m0, 6
    448   pshuflw               m0, m0, 0x0
    449   punpcklqdq            m0, m0
    450   packuswb              m0, m0             ; -> 16 identical bytes
    451 .loop:
    452   mova [dstq             ], m0
    453   mova [dstq          +16], m0
    454   mova [dstq+strideq     ], m0
    455   mova [dstq+strideq  +16], m0
    456   mova [dstq+strideq*2   ], m0
    457   mova [dstq+strideq*2+16], m0
    458   mova [dstq+stride3q    ], m0
    459   mova [dstq+stride3q +16], m0
    460   lea                 dstq, [dstq+strideq*4]
    461   dec              lines4d
    462   jnz .loop
    463 
    464   RESTORE_GOT
    465   REP_RET
    466 
    467 INIT_XMM sse2
        ; dc_top_predictor_32x32(dst, stride, above, left)
        ; Fills the 32x32 block with (sum(above[0..31]) + 16) >> 5.
        ; left is declared as a loaded argument but never read.
    468 cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
    469   GET_GOT     goffsetq
    470 
    471   pxor                  m1, m1
    472   mova                  m0, [aboveq]
    473   mova                  m2, [aboveq+16]
    474   DEFINE_ARGS dst, stride, stride3, lines4
    475   lea             stride3q, [strideq*3]
    476   mov              lines4d, 8               ; 8 iterations x 4 rows
    477   psadbw                m0, m1             ; each psadbw yields two partial sums
    478   psadbw                m2, m1
    479   paddw                 m0, m2
    480   movhlps               m2, m0             ; bring the high-half sum down
    481   paddw                 m0, m2             ; word 0 = total of the 32 bytes
    482   paddw                 m0, [GLOBAL(pw2_32)]
    483   psraw                 m0, 5
    484   pshuflw               m0, m0, 0x0
    485   punpcklqdq            m0, m0
    486   packuswb              m0, m0             ; -> 16 identical bytes
    487 .loop:
    488   mova [dstq             ], m0
    489   mova [dstq          +16], m0
    490   mova [dstq+strideq     ], m0
    491   mova [dstq+strideq  +16], m0
    492   mova [dstq+strideq*2   ], m0
    493   mova [dstq+strideq*2+16], m0
    494   mova [dstq+stride3q    ], m0
    495   mova [dstq+stride3q +16], m0
    496   lea                 dstq, [dstq+strideq*4]
    497   dec              lines4d
    498   jnz .loop
    499 
    500   RESTORE_GOT
    501   REP_RET
    502 
    503 INIT_XMM sse2
        ; dc_left_predictor_32x32(dst, stride, above, left)
        ; Fills the 32x32 block with (sum(left[0..31]) + 16) >> 5; above is not read.
    504 cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
    505   GET_GOT     goffsetq
    506 
    507   pxor                  m1, m1
    508   mova                  m0, [leftq]
    509   mova                  m2, [leftq+16]
    510   DEFINE_ARGS dst, stride, stride3, lines4
    511   lea             stride3q, [strideq*3]
    512   mov              lines4d, 8               ; 8 iterations x 4 rows
    513   psadbw                m0, m1             ; each psadbw yields two partial sums
    514   psadbw                m2, m1
    515   paddw                 m0, m2
    516   movhlps               m2, m0             ; bring the high-half sum down
    517   paddw                 m0, m2             ; word 0 = total of the 32 bytes
    518   paddw                 m0, [GLOBAL(pw2_32)]
    519   psraw                 m0, 5
    520   pshuflw               m0, m0, 0x0
    521   punpcklqdq            m0, m0
    522   packuswb              m0, m0             ; -> 16 identical bytes
    523 .loop:
    524   mova [dstq             ], m0
    525   mova [dstq          +16], m0
    526   mova [dstq+strideq     ], m0
    527   mova [dstq+strideq  +16], m0
    528   mova [dstq+strideq*2   ], m0
    529   mova [dstq+strideq*2+16], m0
    530   mova [dstq+stride3q    ], m0
    531   mova [dstq+stride3q +16], m0
    532   lea                 dstq, [dstq+strideq*4]
    533   dec              lines4d
    534   jnz .loop
    535 
    536   RESTORE_GOT
    537   REP_RET
    538 
    539 INIT_XMM sse2
        ; dc_128_predictor_32x32(dst, stride, above, left)
        ; Fills the 32x32 block with the constant 128 using aligned 16-byte
        ; stores; above and left are not read.
        ; Only dst and stride are declared as loaded arguments and only one XMM
        ; register is declared, matching dc_128_predictor_4x4/8x8. The function
        ; ends with REP_RET like the other looping predictors in this file.
    540 cglobal dc_128_predictor_32x32, 2, 5, 1, dst, stride, above, left, goffset
    541   GET_GOT     goffsetq
    542 
    543   DEFINE_ARGS dst, stride, stride3, lines4
    544   lea             stride3q, [strideq*3]
    545   mov              lines4d, 8               ; 8 iterations x 4 rows
    546   mova    m0,        [GLOBAL(dc_128)]
    547 .loop:
    548   mova [dstq             ], m0
    549   mova [dstq          +16], m0
    550   mova [dstq+strideq     ], m0
    551   mova [dstq+strideq  +16], m0
    552   mova [dstq+strideq*2   ], m0
    553   mova [dstq+strideq*2+16], m0
    554   mova [dstq+stride3q    ], m0
    555   mova [dstq+stride3q +16], m0
    556   lea                 dstq, [dstq+strideq*4]
    557   dec              lines4d
    558   jnz .loop
    559   RESTORE_GOT
    560   REP_RET
    561 
    562 INIT_XMM sse2
        ; v_predictor_4x4(dst, stride, above)
        ; Copies the 4 bytes at above into each of the 4 rows of dst.
    563 cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
    564   movd                  m0, [aboveq]
    565   movd      [dstq        ], m0
    566   movd      [dstq+strideq], m0
    567   lea                 dstq, [dstq+strideq*2]
    568   movd      [dstq        ], m0
    569   movd      [dstq+strideq], m0
    570   RET
    571 
    572 INIT_XMM sse2
        ; v_predictor_8x8(dst, stride, above)
        ; Copies the 8 bytes at above into each of the 8 rows of dst.
    573 cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
    574   movq                  m0, [aboveq]
    575   DEFINE_ARGS dst, stride, stride3         ; above's register is reused as stride3
    576   lea             stride3q, [strideq*3]
    577   movq    [dstq          ], m0
    578   movq    [dstq+strideq  ], m0
    579   movq    [dstq+strideq*2], m0
    580   movq    [dstq+stride3q ], m0
    581   lea                 dstq, [dstq+strideq*4]
    582   movq    [dstq          ], m0
    583   movq    [dstq+strideq  ], m0
    584   movq    [dstq+strideq*2], m0
    585   movq    [dstq+stride3q ], m0
    586   RET
    587 
    588 INIT_XMM sse2
        ; v_predictor_16x16(dst, stride, above)
        ; Copies the 16 bytes at above into each of the 16 rows of dst.
        ; Uses aligned 16-byte loads/stores (mova) for above and every dst row.
    589 cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
    590   mova                  m0, [aboveq]
    591   DEFINE_ARGS dst, stride, stride3, nlines4
    592   lea             stride3q, [strideq*3]
    593   mov              nlines4d, 4              ; 4 iterations x 4 rows
    594 .loop:
    595   mova    [dstq          ], m0
    596   mova    [dstq+strideq  ], m0
    597   mova    [dstq+strideq*2], m0
    598   mova    [dstq+stride3q ], m0
    599   lea                 dstq, [dstq+strideq*4]
    600   dec             nlines4d
    601   jnz .loop
    602   REP_RET
    603 
    604 INIT_XMM sse2
        ; v_predictor_32x32(dst, stride, above)
        ; Copies the 32 bytes at above into each of the 32 rows of dst.
        ; Uses aligned 16-byte loads/stores (mova) for above and every dst row.
    605 cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
    606   mova                  m0, [aboveq]        ; left 16 bytes of the row
    607   mova                  m1, [aboveq+16]     ; right 16 bytes of the row
    608   DEFINE_ARGS dst, stride, stride3, nlines4
    609   lea             stride3q, [strideq*3]
    610   mov              nlines4d, 8              ; 8 iterations x 4 rows
    611 .loop:
    612   mova [dstq             ], m0
    613   mova [dstq          +16], m1
    614   mova [dstq+strideq     ], m0
    615   mova [dstq+strideq  +16], m1
    616   mova [dstq+strideq*2   ], m0
    617   mova [dstq+strideq*2+16], m1
    618   mova [dstq+stride3q    ], m0
    619   mova [dstq+stride3q +16], m1
    620   lea                 dstq, [dstq+strideq*4]
    621   dec             nlines4d
    622   jnz .loop
    623   REP_RET
    624 
    625 INIT_XMM sse2
        ; h_predictor_4x4(dst, stride, <unused>, left)
        ; Fills row r of the 4x4 block with left[r] repeated four times.
        ; The third argument slot is named "line" here but is never used; left
        ; is fetched explicitly because only two arguments are auto-loaded.
    626 cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
    627   movifnidn          leftq, leftmp
    628   movd                  m0, [leftq]
    629   punpcklbw             m0, m0
    630   punpcklbw             m0, m0              ; each left byte repeated 4 times (one dword per row)
    631   pshufd                m1, m0, 0x1         ; dword 1 -> row 1
    632   movd      [dstq        ], m0
    633   movd      [dstq+strideq], m1
    634   pshufd                m2, m0, 0x2         ; dword 2 -> row 2
    635   lea                 dstq, [dstq+strideq*2]
    636   pshufd                m3, m0, 0x3         ; dword 3 -> row 3
    637   movd      [dstq        ], m2
    638   movd      [dstq+strideq], m3
    639   RET
    640 
    641 INIT_XMM sse2
        ; h_predictor_8x8(dst, stride, <unused>, left)
        ; Fills row r of the 8x8 block with left[r] repeated eight times.
        ; The third argument register is used as the loop counter "line", which
        ; counts up from -2 to 0; each iteration writes 4 rows.
    642 cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
    643   movifnidn          leftq, leftmp
    644   mov                lineq, -2
    645   DEFINE_ARGS  dst, stride, line, left, stride3
    646   lea             stride3q, [strideq*3]
    647   movq                  m0, [leftq    ]
    648   punpcklbw             m0, m0              ; l1 l1 l2 l2 ... l8 l8
    649 .loop:
    650   pshuflw               m1, m0, 0x0         ; l1 l1 l1 l1 l1 l1 l1 l1
    651   pshuflw               m2, m0, 0x55        ; l2 l2 l2 l2 l2 l2 l2 l2
    652   movq      [dstq        ], m1
    653   movq      [dstq+strideq], m2
    654   pshuflw               m1, m0, 0xaa        ; l3 repeated 8 times
    655   pshuflw               m2, m0, 0xff        ; l4 repeated 8 times
    656   movq    [dstq+strideq*2], m1
    657   movq    [dstq+stride3q ], m2
    658   pshufd                m0, m0, 0xe         ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8
    659   inc                lineq                  ; sets ZF for the jnz below
    660   lea                 dstq, [dstq+strideq*4] ; lea leaves the flags from inc intact
    661   jnz .loop
    662   REP_RET
    663 
    664 INIT_XMM sse2
        ; h_predictor_16x16(dst, stride, <unused>, left)
        ; Fills row r of the 16x16 block with left[r] repeated 16 times, using
        ; aligned 16-byte stores. "line" counts up from -4 to 0; each iteration
        ; consumes 4 left bytes and writes 4 rows.
    665 cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
    666   movifnidn          leftq, leftmp
    667   mov                lineq, -4
    668   DEFINE_ARGS dst, stride, line, left, stride3
    669   lea             stride3q, [strideq*3]
    670 .loop:
    671   movd                  m0, [leftq]
    672   punpcklbw             m0, m0
    673   punpcklbw             m0, m0              ; l1 to l4 each repeated 4 times
    674   pshufd            m1, m0, 0x0             ; l1 repeated 16 times
    675   pshufd            m2, m0, 0x55            ; l2 repeated 16 times
    676   mova    [dstq          ], m1
    677   mova    [dstq+strideq  ], m2
    678   pshufd            m1, m0, 0xaa            ; l3 repeated 16 times
    679   pshufd            m2, m0, 0xff            ; l4 repeated 16 times
    680   mova    [dstq+strideq*2], m1
    681   mova    [dstq+stride3q ], m2
    682   inc                lineq                  ; sets ZF for the jnz below
    683   lea                leftq, [leftq+4       ] ; lea (not add) keeps the flags from inc
    684   lea                 dstq, [dstq+strideq*4]
    685   jnz .loop
    686   REP_RET
    687 
    688 INIT_XMM sse2
        ; h_predictor_32x32(dst, stride, <unused>, left)
        ; Fills row r of the 32x32 block with left[r] repeated 32 times, using
        ; aligned 16-byte stores. "line" counts up from -8 to 0; each iteration
        ; consumes 4 left bytes and writes 4 rows.
    689 cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
    690   movifnidn              leftq, leftmp
    691   mov                    lineq, -8
    692   DEFINE_ARGS dst, stride, line, left, stride3
    693   lea                 stride3q, [strideq*3]
    694 .loop:
    695   movd                      m0, [leftq]
    696   punpcklbw                 m0, m0
    697   punpcklbw                 m0, m0              ; l1 to l4 each repeated 4 times
    698   pshufd                m1, m0, 0x0             ; l1 repeated 16 times
    699   pshufd                m2, m0, 0x55            ; l2 repeated 16 times
    700   mova     [dstq             ], m1
    701   mova     [dstq+16          ], m1
    702   mova     [dstq+strideq     ], m2
    703   mova     [dstq+strideq+16  ], m2
    704   pshufd                m1, m0, 0xaa            ; l3 repeated 16 times
    705   pshufd                m2, m0, 0xff            ; l4 repeated 16 times
    706   mova     [dstq+strideq*2   ], m1
    707   mova     [dstq+strideq*2+16], m1
    708   mova     [dstq+stride3q    ], m2
    709   mova     [dstq+stride3q+16 ], m2
    710   inc                    lineq                  ; sets ZF for the jnz below
    711   lea                    leftq, [leftq+4       ] ; lea (not add) keeps the flags from inc
    712   lea                     dstq, [dstq+strideq*4]
    713   jnz .loop
    714   REP_RET
    715 
    716 INIT_XMM sse2
        ; tm_predictor_4x4(dst, stride, above, left)
        ; dst[r][c] = saturate_u8(left[r] + above[c] - above[-1]) for a 4x4 block.
        ; The arithmetic is done on 16-bit words; packuswb performs the clamp.
        ; Note: the movq reads 8 bytes starting at above-1, of which 5 are used.
    717 cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left
    718   pxor                  m1, m1
    719   movq                  m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x
    720   punpcklbw             m0, m1
    721   pshuflw               m2, m0, 0x0   ; [63:0] tl tl tl tl [word]
    722   psrldq                m0, 2
    723   psubw                 m0, m2        ; [63:0] t1-tl t2-tl t3-tl t4-tl [word]
    724   movd                  m2, [leftq]
    725   punpcklbw             m2, m1
    726   pshuflw               m4, m2, 0x0   ; [63:0] l1 l1 l1 l1 [word]
    727   pshuflw               m3, m2, 0x55  ; [63:0] l2 l2 l2 l2 [word]
    728   paddw                 m4, m0
    729   paddw                 m3, m0
    730   packuswb              m4, m4
    731   packuswb              m3, m3
    732   movd      [dstq        ], m4
    733   movd      [dstq+strideq], m3
    734   lea                 dstq, [dstq+strideq*2]
    735   pshuflw               m4, m2, 0xaa  ; [63:0] l3 l3 l3 l3 [word]
    736   pshuflw               m3, m2, 0xff  ; [63:0] l4 l4 l4 l4 [word]
    737   paddw                 m4, m0
    738   paddw                 m3, m0
    739   packuswb              m4, m4
    740   packuswb              m3, m3
    741   movd      [dstq        ], m4
    742   movd      [dstq+strideq], m3
    743   RET
    744 
    745 INIT_XMM sse2
        ; tm_predictor_8x8(dst, stride, above, left)
        ; dst[r][c] = saturate_u8(left[r] + above[c] - above[-1]) for an 8x8 block.
        ; The loop runs 4 times ("line" counts up from -4 to 0) and writes two
        ; rows per iteration.
    746 cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left
    747   pxor                  m1, m1
    748   movd                  m2, [aboveq-1]
    749   movq                  m0, [aboveq]
    750   punpcklbw             m2, m1
    751   punpcklbw             m0, m1        ; t1 t2 t3 t4 t5 t6 t7 t8 [word]
    752   pshuflw               m2, m2, 0x0   ; [63:0] tl tl tl tl [word]
    753   DEFINE_ARGS dst, stride, line, left
    754   mov                lineq, -4
    755   punpcklqdq            m2, m2        ; tl tl tl tl tl tl tl tl [word]
    756   psubw                 m0, m2        ; t1-tl t2-tl ... t8-tl [word]
    757   movq                  m2, [leftq]
    758   punpcklbw             m2, m1        ; l1 l2 l3 l4 l5 l6 l7 l8 [word]
    759 .loop:
    760   pshuflw               m4, m2, 0x0   ; [63:0] l1 l1 l1 l1 [word]
    761   pshuflw               m3, m2, 0x55  ; [63:0] l2 l2 l2 l2 [word]
    762   punpcklqdq            m4, m4        ; l1 l1 l1 l1 l1 l1 l1 l1 [word]
    763   punpcklqdq            m3, m3        ; l2 l2 l2 l2 l2 l2 l2 l2 [word]
    764   paddw                 m4, m0
    765   paddw                 m3, m0
    766   packuswb              m4, m3        ; low 8 bytes = first row, high 8 bytes = second row
    767   movq      [dstq        ], m4
    768   movhps    [dstq+strideq], m4
    769   lea                 dstq, [dstq+strideq*2]
    770   psrldq                m2, 4         ; drop the two left values just used
    771   inc                lineq
    772   jnz .loop
    773   REP_RET
    774 
    775 INIT_XMM sse2
        ; tm_predictor_16x16(dst, stride, above, left)
        ; dst[r][c] = saturate_u8(left[r] + above[c] - above[-1]) for a 16x16 block.
        ; The top-left byte is taken from the last byte of the aligned 16-byte
        ; load at above-16. The loop runs 8 times ("line" counts up from -8 to
        ; 0); each iteration writes row r (from m3) and row r+8 (from m5, at
        ; dst + 8*stride). m1 starts as the zero register and is reused as a
        ; temporary inside the loop once the unpacking is done.
    776 cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left
    777   pxor                  m1, m1
    778   mova                  m2, [aboveq-16];
    779   mova                  m0, [aboveq]   ; t1 t2 ... t16 [byte]
    780   punpckhbw             m2, m1         ; [127:112] tl [word]
    781   punpckhbw             m4, m0, m1
    782   punpcklbw             m0, m1         ; m0:m4 t1 t2 ... t16 [word]
    783   DEFINE_ARGS dst, stride, line, left, stride8
    784   mov                lineq, -8
    785   pshufhw               m2, m2, 0xff
    786   mova                  m3, [leftq]    ; l1 l2 ... l16 [byte]
    787   punpckhqdq            m2, m2         ; tl repeated 8 times [word]
    788   psubw                 m0, m2
    789   psubw                 m4, m2         ; m0:m4 t1-tl t2-tl ... t16-tl [word]
    790   punpckhbw             m5, m3, m1
    791   punpcklbw             m3, m1         ; m3:m5 l1 l2 ... l16 [word]
    792   lea             stride8q, [strideq*8]
    793 .loop:
    794   pshuflw               m6, m3, 0x0
    795   pshuflw               m7, m5, 0x0
    796   punpcklqdq            m6, m6         ; l1 repeated 8 times [word]
    797   punpcklqdq            m7, m7         ; l9 repeated 8 times [word]
    798   paddw                 m1, m6, m0
    799   paddw                 m6, m4         ; m1:m6 ti-tl+l1 [i=1,16] [word]
    800   psrldq                m5, 2          ; advance to the next lower-half left value
    801   packuswb              m1, m6
    802   mova     [dstq         ], m1
    803   paddw                 m1, m7, m0
    804   paddw                 m7, m4         ; m1:m7 ti-tl+l9 [i=1,16] [word]
    805   psrldq                m3, 2          ; advance to the next upper-half left value
    806   packuswb              m1, m7
    807   mova     [dstq+stride8q], m1
    808   inc                lineq             ; sets ZF for the jnz below
    809   lea                 dstq, [dstq+strideq] ; lea leaves the flags from inc intact
    810   jnz .loop
    811   REP_RET
    812 
    813 INIT_XMM sse2
        ; tm_predictor_32x32(dst, stride, above, left)
        ; dst[r][c] = saturate_u8(left[r] + above[c] - above[-1]) for a 32x32 block.
        ; m0/m3/m4/m5 hold above[c]-tl for columns 0-7/8-15/16-23/24-31 as words.
        ; "line" counts up from -16 to 0 and leftq is advanced by 32 up front, so
        ; [leftq+lineq*2] walks left[0], left[2], ... left[30]; each iteration
        ; writes two rows.
        ; NOTE(review): the movd in the loop loads 4 bytes although only the low
        ; 2 are used, so the last iteration reads 2 bytes past left+31. The
        ; extra bytes never reach the output; confirm callers' buffers make
        ; that read safe.
    814 cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left
    815   pxor                  m1, m1
    816   movd                  m2, [aboveq-1]
    817   mova                  m0, [aboveq]
    818   mova                  m4, [aboveq+16]
    819   punpcklbw             m2, m1
    820   punpckhbw             m3, m0, m1
    821   punpckhbw             m5, m4, m1
    822   punpcklbw             m0, m1
    823   punpcklbw             m4, m1
    824   pshuflw               m2, m2, 0x0
    825   DEFINE_ARGS dst, stride, line, left
    826   mov                lineq, -16
    827   punpcklqdq            m2, m2        ; tl repeated 8 times [word]
    828   add                leftq, 32
    829   psubw                 m0, m2
    830   psubw                 m3, m2
    831   psubw                 m4, m2
    832   psubw                 m5, m2        ; m0,m3,m4,m5: t(i)-tl [word]
    833 .loop:
    834   movd                  m2, [leftq+lineq*2]
    835   pxor                  m1, m1        ; m1 is reused as a temporary below, so re-zero it
    836   punpcklbw             m2, m1
    837   pshuflw               m7, m2, 0x55
    838   pshuflw               m2, m2, 0x0
    839   punpcklqdq            m2, m2        ; left value for the first row, 8 times [word]
    840   punpcklqdq            m7, m7        ; left value for the second row, 8 times [word]
    841   paddw                 m6, m2, m3
    842   paddw                 m1, m2, m0
    843   packuswb              m1, m6
    844   mova   [dstq           ], m1
    845   paddw                 m6, m2, m5
    846   paddw                 m1, m2, m4
    847   packuswb              m1, m6
    848   mova   [dstq+16        ], m1
    849   paddw                 m6, m7, m3
    850   paddw                 m1, m7, m0
    851   packuswb              m1, m6
    852   mova   [dstq+strideq   ], m1
    853   paddw                 m6, m7, m5
    854   paddw                 m1, m7, m4
    855   packuswb              m1, m6
    856   mova   [dstq+strideq+16], m1
    857   lea                 dstq, [dstq+strideq*2]
    858   inc                lineq
    859   jnz .loop
    860   REP_RET
    861