;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
     10 
%include "third_party/x86inc/x86inc.asm"

SECTION .text

; void vpx_subtract_block(int rows, int cols,
;                         int16_t *diff, ptrdiff_t diff_stride,
;                         const uint8_t *src, ptrdiff_t src_stride,
;                         const uint8_t *pred, ptrdiff_t pred_stride)
;
; Computes diff[r][c] = (int16_t)src[r][c] - (int16_t)pred[r][c] for a
; rows x cols block.  Dispatches on cols (4 / 8 / 16 / 32; any other value
; falls through to the 64-wide loop).  src/pred are addressed with their
; byte strides; diff row advances use diff_strideq*2, i.e. diff_stride is
; counted in int16 elements.

INIT_XMM sse2
cglobal subtract_block, 7, 7, 8, \
                        rows, cols, diff, diff_stride, src, src_stride, \
                        pred, pred_stride
; Only 7 GPRs are requested, so the 8th argument (pred_stride) stays in its
; stack slot (pred_stridemp).  colsq is dead once the width dispatch below
; has run, so it is recycled to hold pred_stride in each case.
%define pred_str colsq
  pxor                  m7, m7         ; dedicated zero register
  cmp                colsd, 4
  je .case_4
  cmp                colsd, 8
  je .case_8
  cmp                colsd, 16
  je .case_16
  cmp                colsd, 32
  je .case_32
; fall through: cols == 64

; loop16 src_off1, src_off2, pred_off1, pred_off2, diff_off1, diff_off2
; Subtracts two 16-byte pixel groups: src[%1]-pred[%3] -> diff[%5] and
; src[%2]-pred[%4] -> diff[%6].  Each group of unsigned bytes is widened to
; words by interleaving with the zero register m7, then subtracted with
; psubw; each 16-pixel group produces 2*mmsize bytes of int16 output.
%macro loop16 6
  mova                  m0, [srcq+%1]
  mova                  m4, [srcq+%2]
  mova                  m1, [predq+%3]
  mova                  m5, [predq+%4]
  punpckhbw             m2, m0, m7     ; high 8 src bytes -> words
  punpckhbw             m3, m1, m7     ; high 8 pred bytes -> words
  punpcklbw             m0, m7         ; low 8 src bytes -> words
  punpcklbw             m1, m7         ; low 8 pred bytes -> words
  psubw                 m2, m3
  psubw                 m0, m1
  punpckhbw             m1, m4, m7     ; same widen/subtract for group 2
  punpckhbw             m3, m5, m7
  punpcklbw             m4, m7
  punpcklbw             m5, m7
  psubw                 m1, m3
  psubw                 m4, m5
  mova [diffq+mmsize*0+%5], m0
  mova [diffq+mmsize*1+%5], m2
  mova [diffq+mmsize*0+%6], m4
  mova [diffq+mmsize*1+%6], m1
%endmacro

  mov             pred_str, pred_stridemp
.loop_64:                              ; 64 pixels (8 xmm stores) per row
  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
  lea                diffq, [diffq+diff_strideq*2]  ; *2: diff rows are int16
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_64
  RET

.case_32:
  mov             pred_str, pred_stridemp
.loop_32:                              ; 32 pixels, one row per iteration
  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_32
  RET

.case_16:
  mov             pred_str, pred_stridemp
.loop_16:
  ; 16 pixels, two rows per iteration: loop16's second group is the next
  ; row, addressed via the strides.  Assumes rows is even -- TODO confirm
  ; with callers (block sizes in the codec are even, presumably).
  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
  lea                diffq, [diffq+diff_strideq*4]  ; advance two int16 rows
  lea                predq, [predq+pred_str*2]
  lea                 srcq, [srcq+src_strideq*2]
  sub                rowsd, 2
  jg .loop_16
  RET

; loop_h: subtracts one movh-sized pixel group from each of two rows.
; Expanded per INIT state: under INIT_XMM movh loads 8 bytes (8 pixels),
; under INIT_MMX it loads 4 (per x86inc's movh mapping).
; NOTE(review): this does not require m7 == 0 -- src and pred bytes are
; interleaved with the *same* m7 contents, so any garbage high bytes cancel
; in the wrapping psubw.  That matters for the mmx .case_4 path below,
; where m7 is mm7 and was never cleared.
%macro loop_h 0
  movh                  m0, [srcq]
  movh                  m2, [srcq+src_strideq]
  movh                  m1, [predq]
  movh                  m3, [predq+pred_str]
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  punpcklbw             m2, m7
  punpcklbw             m3, m7
  psubw                 m0, m1
  psubw                 m2, m3
  mova             [diffq], m0
  mova [diffq+diff_strideq*2], m2     ; second row of diff
%endmacro

.case_8:
  mov             pred_str, pred_stridemp
.loop_8:                               ; 8 pixels, two rows per iteration (xmm)
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_8
  RET

INIT_MMX                               ; re-map m0..m7 so loop_h expands as mmx
.case_4:
  mov             pred_str, pred_stridemp
.loop_4:                               ; 4 pixels, two rows per iteration (mmx)
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_4
  RET
    128