;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

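; HIGH_SAD_FN: shared prologue for the high-bitdepth SAD kernels below.
;   %1 = block width, %2 = block height
;   %3 = number of general-purpose registers (5, or 7 when the stride*3
;        helpers src_stride3q/ref_stride3q are needed)
;   %4 = 0 for plain SAD, 1 for the _avg variant that averages ref with
;        second_pred before taking the difference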
%macro HIGH_SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
; convert src, ref & second_pred to short ptrs (from byte ptrs)
  shl                 srcq, 1
  shl                 refq, 1
%if %4 == 1
  shl         second_predq, 1
%endif
%endmacro

; unsigned int vpx_highbd_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
;                                  uint8_t *ref, int ref_stride);
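; src and ref point to uint16_t samples passed through uint8_t pointers; the
; shl-by-1 in HIGH_SAD_FN converts them back to short-pointer addressing.
; Each loop iteration below handles one 64-pixel row (128 bytes), processed
; as two 64-byte halves.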
%macro HIGH_SAD64XN 1-2 0
  HIGH_SAD_FN 64, %1, 5, %2
  mov              n_rowsd, %1
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  ; first half of each row
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
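  ; SSE2 has no absolute-difference instruction for 16-bit words, so
  ; |src - ref| is built from two saturating subtractions (one of which is
  ; zero) merged with por. The word sums are widened to dwords against the
  ; zero register m6 before accumulating into m0, so the total cannot
  ; overflow a 16-bit lane.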
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+32]
  psubusw               m5, m3
  psubusw               m3, [srcq+32]
  por                   m3, m5
  mova                  m5, [srcq+48]
  psubusw               m5, m4
  psubusw               m4, [srcq+48]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  paddd                 m0, m1
  paddd                 m0, m3
  ; second half of each row
  movu                  m1, [refq+64]
  movu                  m2, [refq+80]
  movu                  m3, [refq+96]
  movu                  m4, [refq+112]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq+64]
  psubusw               m5, m1
  psubusw               m1, [srcq+64]
  por                   m1, m5
  mova                  m5, [srcq+80]
  psubusw               m5, m2
  psubusw               m2, [srcq+80]
  por                   m2, m5
  mova                  m5, [srcq+96]
  psubusw               m5, m3
  psubusw               m3, [srcq+96]
  por                   m3, m5
  mova                  m5, [srcq+112]
  psubusw               m5, m4
  psubusw               m4, [srcq+112]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3

  dec              n_rowsd
  jg .loop

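  ; horizontal reduction: fold the four dword partial sums in m0 down to a
  ; single dword and return it in eax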
  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2


; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
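; Same pattern as HIGH_SAD64XN, but one loop iteration now covers a full
; 32-pixel row (64 bytes), so no second half-row pass is needed.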
%macro HIGH_SAD32XN 1-2 0
  HIGH_SAD_FN 32, %1, 5, %2
  mov              n_rowsd, %1
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+32]
  psubusw               m5, m3
  psubusw               m3, [srcq+32]
  por                   m3, m5
  mova                  m5, [srcq+48]
  psubusw               m5, m4
  psubusw               m4, [srcq+48]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2

; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
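; Two 16-pixel rows are processed per loop iteration (hence the %1/2 row
; counter); the second row is reached via the ref_strideq*2 / src_strideq*2
; byte offsets.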
%macro HIGH_SAD16XN 1-2 0
  HIGH_SAD_FN 16, %1, 5, %2
  mov              n_rowsd, %1/2
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_strideq*2+16]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+src_strideq*2]
  psubusw               m5, m3
  psubusw               m3, [srcq+src_strideq*2]
  por                   m3, m5
  mova                  m5, [srcq+src_strideq*2+16]
  psubusw               m5, m4
  psubusw               m4, [srcq+src_strideq*2+16]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
HIGH_SAD16XN  8 ; highbd_sad16x8_sse2
HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
HIGH_SAD16XN  8, 1 ; highbd_sad16x8_avg_sse2


; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
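; Four 8-pixel rows are processed per loop iteration (row counter %1/4); the
; 7-register variant of HIGH_SAD_FN supplies src_stride3q/ref_stride3q so the
; fourth row can be addressed directly.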
%macro HIGH_SAD8XN 1-2 0
  HIGH_SAD_FN 8, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq*2]
  movu                  m3, [refq+ref_strideq*4]
  movu                  m4, [refq+ref_stride3q*2]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+src_strideq*2]
  psubusw               m5, m2
  psubusw               m2, [srcq+src_strideq*2]
  por                   m2, m5
  mova                  m5, [srcq+src_strideq*4]
  psubusw               m5, m3
  psubusw               m3, [srcq+src_strideq*4]
  por                   m3, m5
  mova                  m5, [srcq+src_stride3q*2]
  psubusw               m5, m4
  psubusw               m4, [srcq+src_stride3q*2]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*8]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*8]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
HIGH_SAD8XN  8 ; highbd_sad8x8_sse2
HIGH_SAD8XN  4 ; highbd_sad8x4_sse2
HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
HIGH_SAD8XN  8, 1 ; highbd_sad8x8_avg_sse2
HIGH_SAD8XN  4, 1 ; highbd_sad8x4_avg_sse2