;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
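; Loads two 4-pixel rows of src and of each of the four refs, computes the
; SAD of each ref against src, and keeps running totals in two packed
; accumulators: m6 = {ref1, ref2} and m7 = {ref3, ref4} (one dword each).
; first == 1 initializes the accumulators; advance_at_end == 1 advances all
; five pointers by two rows.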
%macro PROCESS_4x2x4 5-6 0
  movd                  m0, [srcq +%2]
%if %1 == 1
  movd                  m6, [ref1q+%3]
  movd                  m4, [ref2q+%3]
  movd                  m7, [ref3q+%3]
  movd                  m5, [ref4q+%3]
  punpckldq             m0, [srcq +%4]
  punpckldq             m6, [ref1q+%5]
  punpckldq             m4, [ref2q+%5]
  punpckldq             m7, [ref3q+%5]
  punpckldq             m5, [ref4q+%5]
  psadbw                m6, m0
  psadbw                m4, m0
  psadbw                m7, m0
  psadbw                m5, m0
  punpckldq             m6, m4
  punpckldq             m7, m5
%else
  movd                  m1, [ref1q+%3]
  movd                  m2, [ref2q+%3]
  movd                  m3, [ref3q+%3]
  movd                  m4, [ref4q+%3]
  punpckldq             m0, [srcq +%4]
  punpckldq             m1, [ref1q+%5]
  punpckldq             m2, [ref2q+%5]
  punpckldq             m3, [ref3q+%5]
  punpckldq             m4, [ref4q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  psadbw                m4, m0
  punpckldq             m1, m2
  punpckldq             m3, m4
  paddd                 m6, m1
  paddd                 m7, m3
%endif
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
%endmacro

; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
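; Same pattern for 8-pixel-wide rows: movh/movhps pack two rows into one
; register, and each ref gets its own accumulator (m4 = ref1, m5 = ref2,
; m6 = ref3, m7 = ref4). psadbw leaves one partial sum per 64-bit lane;
; the lanes are combined at the end of SADNXN4D.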
%macro PROCESS_8x2x4 5-6 0
  movh                  m0, [srcq +%2]
%if %1 == 1
  movh                  m4, [ref1q+%3]
  movh                  m5, [ref2q+%3]
  movh                  m6, [ref3q+%3]
  movh                  m7, [ref4q+%3]
  movhps                m0, [srcq +%4]
  movhps                m4, [ref1q+%5]
  movhps                m5, [ref2q+%5]
  movhps                m6, [ref3q+%5]
  movhps                m7, [ref4q+%5]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movh                  m1, [ref1q+%3]
  movh                  m2, [ref2q+%3]
  movh                  m3, [ref3q+%3]
  movhps                m0, [srcq +%4]
  movhps                m1, [ref1q+%5]
  movhps                m2, [ref2q+%5]
  movhps                m3, [ref3q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movh                  m1, [ref4q+%3]
  movhps                m1, [ref4q+%5]
  paddd                 m5, m2
  paddd                 m6, m3
  psadbw                m1, m0
  paddd                 m7, m1
%endif
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
%endmacro

; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
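; Two full 16-pixel rows per call: src is loaded aligned (mova), the refs
; unaligned (movu), with one accumulator per ref (m4..m7). When
; advance_at_end == 1 the pointer updates are interleaved before the last
; psadbw/paddd pair, presumably to overlap the address arithmetic with the
; SAD latency.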
%macro PROCESS_16x2x4 5-6 0
  ; 1st 16 px
  mova                  m0, [srcq +%2]
%if %1 == 1
  movu                  m4, [ref1q+%3]
  movu                  m5, [ref2q+%3]
  movu                  m6, [ref3q+%3]
  movu                  m7, [ref4q+%3]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movu                  m1, [ref1q+%3]
  movu                  m2, [ref2q+%3]
  movu                  m3, [ref3q+%3]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movu                  m1, [ref4q+%3]
  paddd                 m5, m2
  paddd                 m6, m3
  psadbw                m1, m0
  paddd                 m7, m1
%endif

  ; 2nd 16 px
  mova                  m0, [srcq +%4]
  movu                  m1, [ref1q+%5]
  movu                  m2, [ref2q+%5]
  movu                  m3, [ref3q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movu                  m1, [ref4q+%5]
  paddd                 m5, m2
  paddd                 m6, m3
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
  psadbw                m1, m0
  paddd                 m7, m1
%endmacro

; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
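; A 32-pixel row is handled as two 16-pixel halves; only the second call is
; allowed to advance the row pointers.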
%macro PROCESS_32x2x4 5-6 0
  PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16
  PROCESS_16x2x4  0, %4, %5, %4 + 16, %5 + 16, %6
%endmacro

; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
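; Likewise, a 64-pixel row is two 32-pixel halves.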
%macro PROCESS_64x2x4 5-6 0
  PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32
  PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6
%endmacro

; void vp9_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
;                         uint8_t *ref[4], int ref_stride,
;                         unsigned int res[4]);
; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8,
;             8x16, 8x8, 8x4, 4x8 or 4x4
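;
; As a rough C-equivalent sketch (an illustration, not part of the original
; source), each kernel computes four SADs against the same source block in
; a single pass:
;
;   for (i = 0; i < 4; i++) {
;     unsigned int sad = 0;
;     for (row = 0; row < height; row++)
;       for (col = 0; col < width; col++)
;         sad += abs(src[row * src_stride + col] -
;                    ref[i][row * ref_stride + col]);
;     res[i] = sad;
;   }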
%macro SADNXN4D 2
%if UNIX64
cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4
%else
cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4
%endif
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
  mov                ref2q, [ref1q+gprsize*1]
  mov                ref3q, [ref1q+gprsize*2]
  mov                ref4q, [ref1q+gprsize*3]
  mov                ref1q, [ref1q+gprsize*0]

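; The first call (first = 1) initializes the accumulators.  Each call covers
; two rows, so (%2-4)/2 further advancing calls plus one final non-advancing
; call process all %2 rows.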
  PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%rep (%2-4)/2
  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0

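; Reduce the accumulators to four 32-bit totals in res[0..3].  In the XMM
; case each of m4..m7 holds two partial sums (one per 64-bit lane); the
; shift/or/unpack sequence gathers the low lanes into m4 and the high lanes
; into m5 so a single paddd yields {res0, res1, res2, res3}.  In the MMX
; case m6 and m7 already hold {res0, res1} and {res2, res3}.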
%if mmsize == 16
  pslldq                m5, 4
  pslldq                m7, 4
  por                   m4, m5
  por                   m6, m7
  mova                  m5, m4
  mova                  m7, m6
  punpcklqdq            m4, m6
  punpckhqdq            m5, m7
  movifnidn             r4, r4mp
  paddd                 m4, m5
  movu                [r4], m4
  RET
%else
  movifnidn             r4, r4mp
  movq               [r4+0], m6
  movq               [r4+8], m7
  RET
%endif
%endmacro

INIT_XMM sse2
SADNXN4D 64, 64
SADNXN4D 64, 32
SADNXN4D 32, 64
SADNXN4D 32, 32
SADNXN4D 32, 16
SADNXN4D 16, 32
SADNXN4D 16, 16
SADNXN4D 16,  8
SADNXN4D  8, 16
SADNXN4D  8,  8
SADNXN4D  8,  4

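; The 4-wide blocks use 64-bit MMX registers (psadbw on MMX registers is an
; SSE instruction, hence INIT_MMX sse), taking the PROCESS_4x2x4 path and the
; mmsize == 8 store path above.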
INIT_MMX sse
SADNXN4D  4,  8
SADNXN4D  4,  4