;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

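; SAD_FN %1 (width), %2 (height), %3 (gpr count), %4 (avg flag)
;
; Shared prologue for the SADWxN macros below:
;   %1, %2 - block width/height; used only to name the generated symbol.
;   %3     - 5 or 7; 7 when the kernel also needs the precomputed
;            src_stride3/ref_stride3 registers.
;   %4     - 0 for plain SAD, 1 for the *_avg variant, which takes a fifth
;            second_pred argument (averaged into the reference rows by the
;            calling macro).
; The prologue also sign-extends both strides and, when %3 == 7, computes
; stride*3.  On 32-bit x86 the 7-register avg variants keep the row counter
; in a stack slot (r0m) instead of a register.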
%macro SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; unsigned int vp9_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
;                                     uint8_t *ref, int ref_stride);
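; %1 is the block height, optional %2 selects the *_avg variant.  Each loop
; iteration covers one 64-pixel row as four unaligned 16-byte loads; psadbw
; produces per-lane partial sums that are reduced to a single dword at the
; end (movhlps + paddd + movd).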
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
  mov              n_rowsd, %1
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]
  paddd                 m1, m2
  paddd                 m3, m4
  add                 refq, ref_strideq
  paddd                 m0, m1
  add                 srcq, src_strideq
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2

; unsigned int vp9_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                        uint8_t *ref, int ref_stride);
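; Same scheme as SAD64XN, but each iteration handles two 32-pixel rows
; (two 16-byte loads per row), so the row counter is %1/2.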
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
  mov              n_rowsd, %1/2
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq]
  movu                  m4, [refq+ref_strideq+16]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+src_strideq]
  psadbw                m4, [srcq+src_strideq+16]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD32XN 64 ; sad32x64_sse2
SAD32XN 32 ; sad32x32_sse2
SAD32XN 16 ; sad32x16_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2

; unsigned int vp9_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
;                                       uint8_t *ref, int ref_stride);
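; One 16-byte load per row, four rows per iteration (row counter %1/4),
; using the stride*3 registers set up by SAD_FN to reach the fourth row.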
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+src_strideq]
  psadbw                m3, [srcq+src_strideq*2]
  psadbw                m4, [srcq+src_stride3q]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD16XN 32 ; sad16x32_sse2
SAD16XN 16 ; sad16x16_sse2
SAD16XN  8 ; sad16x8_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN  8, 1 ; sad16x8_avg_sse2

; unsigned int vp9_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                     uint8_t *ref, int ref_stride);
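; Rows are only 8 bytes wide, so two rows are packed into one xmm register
; (movh fills the low half, movhps the high half) and four rows are
; processed per iteration.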
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0

.loop:
  movh                  m1, [refq]
  movhps                m1, [refq+ref_strideq]
  movh                  m2, [refq+ref_strideq*2]
  movhps                m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movh                  m3, [srcq]
  movhps                m3, [srcq+src_strideq]
  movh                  m4, [srcq+src_strideq*2]
  movhps                m4, [srcq+src_stride3q]
  psadbw                m1, m3
  psadbw                m2, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m2
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD8XN 16 ; sad8x16_sse2
SAD8XN  8 ; sad8x8_sse2
SAD8XN  4 ; sad8x4_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN  8, 1 ; sad8x8_avg_sse2
SAD8XN  4, 1 ; sad8x4_avg_sse2

; unsigned int vp9_sad4x{4,8}_sse(uint8_t *src, int src_stride,
;                                 uint8_t *ref, int ref_stride);
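; 4-byte rows: four movd loads are paired with punpckldq so two rows share
; one 8-byte register, then a single psadbw sums all eight bytes.  These
; kernels are built with INIT_MMX sse, i.e. they operate on 8-byte MMX
; registers, which is why no movhlps reduction is needed at the end.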
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0

.loop:
  movd                  m1, [refq]
  movd                  m2, [refq+ref_strideq]
  movd                  m3, [refq+ref_strideq*2]
  movd                  m4, [refq+ref_stride3q]
  punpckldq             m1, m2
  punpckldq             m3, m4
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m3, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movd                  m2, [srcq]
  movd                  m5, [srcq+src_strideq]
  movd                  m4, [srcq+src_strideq*2]
  movd                  m6, [srcq+src_stride3q]
  punpckldq             m2, m5
  punpckldq             m4, m6
  psadbw                m1, m2
  psadbw                m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movd                 eax, m0
  RET
%endmacro

INIT_MMX sse
SAD4XN  8 ; sad4x8_sse
SAD4XN  4 ; sad4x4_sse
SAD4XN  8, 1 ; sad4x8_avg_sse
SAD4XN  4, 1 ; sad4x4_avg_sse