;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

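; convolve_fn generates the no-filter "convolution" kernels: the copy
; variant simply copies the source block to the destination, while the avg
; variant averages it with the pixels already in the destination using
; pavgb (rounding average). The filter arguments (fx, fxs, fy, fys) are
; declared only to match the common convolve prototype and are never read.
; The block width selects one of the specialized loops below; widths other
; than 4/8/16/32 fall through to the 64-wide loop.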
%macro convolve_fn 1
INIT_XMM sse2
cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
                              fx, fxs, fy, fys, w, h
  mov r4d, dword wm
  cmp r4d, 4
  je .w4
  cmp r4d, 8
  je .w8
  cmp r4d, 16
  je .w16
  cmp r4d, 32
  je .w32

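; Fall-through path: 64 pixels per row. One row per iteration, loaded as
; four unaligned 16-byte chunks and written back with aligned stores; the
; avg variant first averages each chunk with the destination.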
  mov                    r4d, dword hm
.loop64:
  movu                    m0, [srcq]
  movu                    m1, [srcq+16]
  movu                    m2, [srcq+32]
  movu                    m3, [srcq+48]
  add                   srcq, src_strideq
%ifidn %1, avg
  pavgb                   m0, [dstq]
  pavgb                   m1, [dstq+16]
  pavgb                   m2, [dstq+32]
  pavgb                   m3, [dstq+48]
%endif
  mova             [dstq   ], m0
  mova             [dstq+16], m1
  mova             [dstq+32], m2
  mova             [dstq+48], m3
  add                   dstq, dst_strideq
  dec                    r4d
  jnz .loop64
  RET

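; 32 pixels per row: two rows per iteration (two 16-byte loads each), so
; the row counter steps down by 2.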
.w32:
  mov                    r4d, dword hm
.loop32:
  movu                    m0, [srcq]
  movu                    m1, [srcq+16]
  movu                    m2, [srcq+src_strideq]
  movu                    m3, [srcq+src_strideq+16]
  lea                   srcq, [srcq+src_strideq*2]
%ifidn %1, avg
  pavgb                   m0, [dstq]
  pavgb                   m1, [dstq            +16]
  pavgb                   m2, [dstq+dst_strideq]
  pavgb                   m3, [dstq+dst_strideq+16]
%endif
  mova [dstq               ], m0
  mova [dstq            +16], m1
  mova [dstq+dst_strideq   ], m2
  mova [dstq+dst_strideq+16], m3
  lea                   dstq, [dstq+dst_strideq*2]
  sub                    r4d, 2
  jnz .loop32
  RET

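; 16 pixels per row: four rows per iteration. r5/r6 cache 3*stride so all
; four rows can be addressed without updating the pointers mid-loop.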
.w16:
  mov                    r4d, dword hm
  lea                    r5q, [src_strideq*3]
  lea                    r6q, [dst_strideq*3]
.loop16:
  movu                    m0, [srcq]
  movu                    m1, [srcq+src_strideq]
  movu                    m2, [srcq+src_strideq*2]
  movu                    m3, [srcq+r5q]
  lea                   srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavgb                   m0, [dstq]
  pavgb                   m1, [dstq+dst_strideq]
  pavgb                   m2, [dstq+dst_strideq*2]
  pavgb                   m3, [dstq+r6q]
%endif
  mova  [dstq              ], m0
  mova  [dstq+dst_strideq  ], m1
  mova  [dstq+dst_strideq*2], m2
  mova  [dstq+r6q          ], m3
  lea                   dstq, [dstq+dst_strideq*4]
  sub                    r4d, 4
  jnz .loop16
  RET

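; 8 pixels per row: drop to 64-bit MMX-size registers; the "sse" flag keeps
; pavgb (an SSE/MMXEXT instruction when used on MMX registers) available
; for the avg variant. Same four-rows-per-iteration scheme as the 16-wide
; loop.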
INIT_MMX sse
.w8:
  mov                    r4d, dword hm
  lea                    r5q, [src_strideq*3]
  lea                    r6q, [dst_strideq*3]
.loop8:
  movu                    m0, [srcq]
  movu                    m1, [srcq+src_strideq]
  movu                    m2, [srcq+src_strideq*2]
  movu                    m3, [srcq+r5q]
  lea                   srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavgb                   m0, [dstq]
  pavgb                   m1, [dstq+dst_strideq]
  pavgb                   m2, [dstq+dst_strideq*2]
  pavgb                   m3, [dstq+r6q]
%endif
  mova  [dstq              ], m0
  mova  [dstq+dst_strideq  ], m1
  mova  [dstq+dst_strideq*2], m2
  mova  [dstq+r6q          ], m3
  lea                   dstq, [dstq+dst_strideq*4]
  sub                    r4d, 4
  jnz .loop8
  RET

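; 4 pixels per row: same structure as the 8-wide loop, but each row is a
; 4-byte transfer (movh maps to movd while MMX registers are in use).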
.w4:
  mov                    r4d, dword hm
  lea                    r5q, [src_strideq*3]
  lea                    r6q, [dst_strideq*3]
.loop4:
  movh                    m0, [srcq]
  movh                    m1, [srcq+src_strideq]
  movh                    m2, [srcq+src_strideq*2]
  movh                    m3, [srcq+r5q]
  lea                   srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavgb                   m0, [dstq]
  pavgb                   m1, [dstq+dst_strideq]
  pavgb                   m2, [dstq+dst_strideq*2]
  pavgb                   m3, [dstq+r6q]
%endif
  movh  [dstq              ], m0
  movh  [dstq+dst_strideq  ], m1
  movh  [dstq+dst_strideq*2], m2
  movh  [dstq+r6q          ], m3
  lea                   dstq, [dstq+dst_strideq*4]
  sub                    r4d, 4
  jnz .loop4
  RET
%endmacro

convolve_fn copy
convolve_fn avg