; SSSE3 three-offset SAD (sum of absolute differences) for 16-pixel-wide
; blocks: vp9_sad16x16x3_ssse3 and vp9_sad16x8x3_ssse3.
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

     14 %macro PROCESS_16X2X3 1
     15 %if %1
     16         movdqa          xmm0,       XMMWORD PTR [rsi]
     17         lddqu           xmm5,       XMMWORD PTR [rdi]
     18         lddqu           xmm6,       XMMWORD PTR [rdi+1]
     19         lddqu           xmm7,       XMMWORD PTR [rdi+2]
     20 
     21         psadbw          xmm5,       xmm0
     22         psadbw          xmm6,       xmm0
     23         psadbw          xmm7,       xmm0
     24 %else
     25         movdqa          xmm0,       XMMWORD PTR [rsi]
     26         lddqu           xmm1,       XMMWORD PTR [rdi]
     27         lddqu           xmm2,       XMMWORD PTR [rdi+1]
     28         lddqu           xmm3,       XMMWORD PTR [rdi+2]
     29 
     30         psadbw          xmm1,       xmm0
     31         psadbw          xmm2,       xmm0
     32         psadbw          xmm3,       xmm0
     33 
     34         paddw           xmm5,       xmm1
     35         paddw           xmm6,       xmm2
     36         paddw           xmm7,       xmm3
     37 %endif
     38         movdqa          xmm0,       XMMWORD PTR [rsi+rax]
     39         lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
     40         lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
     41         lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]
     42 
     43         lea             rsi,        [rsi+rax*2]
     44         lea             rdi,        [rdi+rdx*2]
     45 
     46         psadbw          xmm1,       xmm0
     47         psadbw          xmm2,       xmm0
     48         psadbw          xmm3,       xmm0
     49 
     50         paddw           xmm5,       xmm1
     51         paddw           xmm6,       xmm2
     52         paddw           xmm7,       xmm3
     53 %endmacro
     54 
     55 %macro PROCESS_16X2X3_OFFSET 2
     56 %if %1
     57         movdqa          xmm0,       XMMWORD PTR [rsi]
     58         movdqa          xmm4,       XMMWORD PTR [rdi]
     59         movdqa          xmm7,       XMMWORD PTR [rdi+16]
     60 
     61         movdqa          xmm5,       xmm7
     62         palignr         xmm5,       xmm4,       %2
     63 
     64         movdqa          xmm6,       xmm7
     65         palignr         xmm6,       xmm4,       (%2+1)
     66 
     67         palignr         xmm7,       xmm4,       (%2+2)
     68 
     69         psadbw          xmm5,       xmm0
     70         psadbw          xmm6,       xmm0
     71         psadbw          xmm7,       xmm0
     72 %else
     73         movdqa          xmm0,       XMMWORD PTR [rsi]
     74         movdqa          xmm4,       XMMWORD PTR [rdi]
     75         movdqa          xmm3,       XMMWORD PTR [rdi+16]
     76 
     77         movdqa          xmm1,       xmm3
     78         palignr         xmm1,       xmm4,       %2
     79 
     80         movdqa          xmm2,       xmm3
     81         palignr         xmm2,       xmm4,       (%2+1)
     82 
     83         palignr         xmm3,       xmm4,       (%2+2)
     84 
     85         psadbw          xmm1,       xmm0
     86         psadbw          xmm2,       xmm0
     87         psadbw          xmm3,       xmm0
     88 
     89         paddw           xmm5,       xmm1
     90         paddw           xmm6,       xmm2
     91         paddw           xmm7,       xmm3
     92 %endif
     93         movdqa          xmm0,       XMMWORD PTR [rsi+rax]
     94         movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
     95         movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]
     96 
     97         movdqa          xmm1,       xmm3
     98         palignr         xmm1,       xmm4,       %2
     99 
    100         movdqa          xmm2,       xmm3
    101         palignr         xmm2,       xmm4,       (%2+1)
    102 
    103         palignr         xmm3,       xmm4,       (%2+2)
    104 
    105         lea             rsi,        [rsi+rax*2]
    106         lea             rdi,        [rdi+rdx*2]
    107 
    108         psadbw          xmm1,       xmm0
    109         psadbw          xmm2,       xmm0
    110         psadbw          xmm3,       xmm0
    111 
    112         paddw           xmm5,       xmm1
    113         paddw           xmm6,       xmm2
    114         paddw           xmm7,       xmm3
    115 %endmacro
    116 
    117 %macro PROCESS_16X16X3_OFFSET 2
    118 %2_aligned_by_%1:
    119 
    120         sub             rdi,        %1
    121 
    122         PROCESS_16X2X3_OFFSET 1, %1
    123         PROCESS_16X2X3_OFFSET 0, %1
    124         PROCESS_16X2X3_OFFSET 0, %1
    125         PROCESS_16X2X3_OFFSET 0, %1
    126         PROCESS_16X2X3_OFFSET 0, %1
    127         PROCESS_16X2X3_OFFSET 0, %1
    128         PROCESS_16X2X3_OFFSET 0, %1
    129         PROCESS_16X2X3_OFFSET 0, %1
    130 
    131         jmp             %2_store_off
    132 
    133 %endmacro
    134 
    135 %macro PROCESS_16X8X3_OFFSET 2
    136 %2_aligned_by_%1:
    137 
    138         sub             rdi,        %1
    139 
    140         PROCESS_16X2X3_OFFSET 1, %1
    141         PROCESS_16X2X3_OFFSET 0, %1
    142         PROCESS_16X2X3_OFFSET 0, %1
    143         PROCESS_16X2X3_OFFSET 0, %1
    144 
    145         jmp             %2_store_off
    146 
    147 %endmacro
    148 
    149 ;void int vp9_sad16x16x3_ssse3(
    150 ;    unsigned char *src_ptr,
    151 ;    int  src_stride,
    152 ;    unsigned char *ref_ptr,
    153 ;    int  ref_stride,
    154 ;    int  *results)
    155 global sym(vp9_sad16x16x3_ssse3) PRIVATE
    156 sym(vp9_sad16x16x3_ssse3):
    157     push        rbp
    158     mov         rbp, rsp
    159     SHADOW_ARGS_TO_STACK 5
    160     SAVE_XMM 7
    161     push        rsi
    162     push        rdi
    163     push        rcx
    164     ; end prolog
    165 
    166         mov             rsi,        arg(0) ;src_ptr
    167         mov             rdi,        arg(2) ;ref_ptr
    168 
    169         mov             rdx,        0xf
    170         and             rdx,        rdi
    171 
    172         jmp .vp9_sad16x16x3_ssse3_skiptable
    173 .vp9_sad16x16x3_ssse3_jumptable:
    174         dd .vp9_sad16x16x3_ssse3_aligned_by_0  - .vp9_sad16x16x3_ssse3_do_jump
    175         dd .vp9_sad16x16x3_ssse3_aligned_by_1  - .vp9_sad16x16x3_ssse3_do_jump
    176         dd .vp9_sad16x16x3_ssse3_aligned_by_2  - .vp9_sad16x16x3_ssse3_do_jump
    177         dd .vp9_sad16x16x3_ssse3_aligned_by_3  - .vp9_sad16x16x3_ssse3_do_jump
    178         dd .vp9_sad16x16x3_ssse3_aligned_by_4  - .vp9_sad16x16x3_ssse3_do_jump
    179         dd .vp9_sad16x16x3_ssse3_aligned_by_5  - .vp9_sad16x16x3_ssse3_do_jump
    180         dd .vp9_sad16x16x3_ssse3_aligned_by_6  - .vp9_sad16x16x3_ssse3_do_jump
    181         dd .vp9_sad16x16x3_ssse3_aligned_by_7  - .vp9_sad16x16x3_ssse3_do_jump
    182         dd .vp9_sad16x16x3_ssse3_aligned_by_8  - .vp9_sad16x16x3_ssse3_do_jump
    183         dd .vp9_sad16x16x3_ssse3_aligned_by_9  - .vp9_sad16x16x3_ssse3_do_jump
    184         dd .vp9_sad16x16x3_ssse3_aligned_by_10 - .vp9_sad16x16x3_ssse3_do_jump
    185         dd .vp9_sad16x16x3_ssse3_aligned_by_11 - .vp9_sad16x16x3_ssse3_do_jump
    186         dd .vp9_sad16x16x3_ssse3_aligned_by_12 - .vp9_sad16x16x3_ssse3_do_jump
    187         dd .vp9_sad16x16x3_ssse3_aligned_by_13 - .vp9_sad16x16x3_ssse3_do_jump
    188         dd .vp9_sad16x16x3_ssse3_aligned_by_14 - .vp9_sad16x16x3_ssse3_do_jump
    189         dd .vp9_sad16x16x3_ssse3_aligned_by_15 - .vp9_sad16x16x3_ssse3_do_jump
    190 .vp9_sad16x16x3_ssse3_skiptable:
    191 
    192         call .vp9_sad16x16x3_ssse3_do_jump
    193 .vp9_sad16x16x3_ssse3_do_jump:
    194         pop             rcx                         ; get the address of do_jump
    195         mov             rax,  .vp9_sad16x16x3_ssse3_jumptable - .vp9_sad16x16x3_ssse3_do_jump
    196         add             rax,  rcx  ; get the absolute address of vp9_sad16x16x3_ssse3_jumptable
    197 
    198         movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
    199         add             rcx,        rax
    200 
    201         movsxd          rax,        dword ptr arg(1) ;src_stride
    202         movsxd          rdx,        dword ptr arg(3) ;ref_stride
    203 
    204         jmp             rcx
    205 
    206         PROCESS_16X16X3_OFFSET 0,  .vp9_sad16x16x3_ssse3
    207         PROCESS_16X16X3_OFFSET 1,  .vp9_sad16x16x3_ssse3
    208         PROCESS_16X16X3_OFFSET 2,  .vp9_sad16x16x3_ssse3
    209         PROCESS_16X16X3_OFFSET 3,  .vp9_sad16x16x3_ssse3
    210         PROCESS_16X16X3_OFFSET 4,  .vp9_sad16x16x3_ssse3
    211         PROCESS_16X16X3_OFFSET 5,  .vp9_sad16x16x3_ssse3
    212         PROCESS_16X16X3_OFFSET 6,  .vp9_sad16x16x3_ssse3
    213         PROCESS_16X16X3_OFFSET 7,  .vp9_sad16x16x3_ssse3
    214         PROCESS_16X16X3_OFFSET 8,  .vp9_sad16x16x3_ssse3
    215         PROCESS_16X16X3_OFFSET 9,  .vp9_sad16x16x3_ssse3
    216         PROCESS_16X16X3_OFFSET 10, .vp9_sad16x16x3_ssse3
    217         PROCESS_16X16X3_OFFSET 11, .vp9_sad16x16x3_ssse3
    218         PROCESS_16X16X3_OFFSET 12, .vp9_sad16x16x3_ssse3
    219         PROCESS_16X16X3_OFFSET 13, .vp9_sad16x16x3_ssse3
    220         PROCESS_16X16X3_OFFSET 14, .vp9_sad16x16x3_ssse3
    221 
    222 .vp9_sad16x16x3_ssse3_aligned_by_15:
    223         PROCESS_16X2X3 1
    224         PROCESS_16X2X3 0
    225         PROCESS_16X2X3 0
    226         PROCESS_16X2X3 0
    227         PROCESS_16X2X3 0
    228         PROCESS_16X2X3 0
    229         PROCESS_16X2X3 0
    230         PROCESS_16X2X3 0
    231 
    232 .vp9_sad16x16x3_ssse3_store_off:
    233         mov             rdi,        arg(4) ;Results
    234 
    235         movq            xmm0,       xmm5
    236         psrldq          xmm5,       8
    237 
    238         paddw           xmm0,       xmm5
    239         movd            [rdi],      xmm0
    240 ;-
    241         movq            xmm0,       xmm6
    242         psrldq          xmm6,       8
    243 
    244         paddw           xmm0,       xmm6
    245         movd            [rdi+4],    xmm0
    246 ;-
    247         movq            xmm0,       xmm7
    248         psrldq          xmm7,       8
    249 
    250         paddw           xmm0,       xmm7
    251         movd            [rdi+8],    xmm0
    252 
    253     ; begin epilog
    254     pop         rcx
    255     pop         rdi
    256     pop         rsi
    257     RESTORE_XMM
    258     UNSHADOW_ARGS
    259     pop         rbp
    260     ret
    261 
    262 ;void int vp9_sad16x8x3_ssse3(
    263 ;    unsigned char *src_ptr,
    264 ;    int  src_stride,
    265 ;    unsigned char *ref_ptr,
    266 ;    int  ref_stride,
    267 ;    int  *results)
    268 global sym(vp9_sad16x8x3_ssse3) PRIVATE
    269 sym(vp9_sad16x8x3_ssse3):
    270     push        rbp
    271     mov         rbp, rsp
    272     SHADOW_ARGS_TO_STACK 5
    273     SAVE_XMM 7
    274     push        rsi
    275     push        rdi
    276     push        rcx
    277     ; end prolog
    278 
    279         mov             rsi,        arg(0) ;src_ptr
    280         mov             rdi,        arg(2) ;ref_ptr
    281 
    282         mov             rdx,        0xf
    283         and             rdx,        rdi
    284 
    285         jmp .vp9_sad16x8x3_ssse3_skiptable
    286 .vp9_sad16x8x3_ssse3_jumptable:
    287         dd .vp9_sad16x8x3_ssse3_aligned_by_0  - .vp9_sad16x8x3_ssse3_do_jump
    288         dd .vp9_sad16x8x3_ssse3_aligned_by_1  - .vp9_sad16x8x3_ssse3_do_jump
    289         dd .vp9_sad16x8x3_ssse3_aligned_by_2  - .vp9_sad16x8x3_ssse3_do_jump
    290         dd .vp9_sad16x8x3_ssse3_aligned_by_3  - .vp9_sad16x8x3_ssse3_do_jump
    291         dd .vp9_sad16x8x3_ssse3_aligned_by_4  - .vp9_sad16x8x3_ssse3_do_jump
    292         dd .vp9_sad16x8x3_ssse3_aligned_by_5  - .vp9_sad16x8x3_ssse3_do_jump
    293         dd .vp9_sad16x8x3_ssse3_aligned_by_6  - .vp9_sad16x8x3_ssse3_do_jump
    294         dd .vp9_sad16x8x3_ssse3_aligned_by_7  - .vp9_sad16x8x3_ssse3_do_jump
    295         dd .vp9_sad16x8x3_ssse3_aligned_by_8  - .vp9_sad16x8x3_ssse3_do_jump
    296         dd .vp9_sad16x8x3_ssse3_aligned_by_9  - .vp9_sad16x8x3_ssse3_do_jump
    297         dd .vp9_sad16x8x3_ssse3_aligned_by_10 - .vp9_sad16x8x3_ssse3_do_jump
    298         dd .vp9_sad16x8x3_ssse3_aligned_by_11 - .vp9_sad16x8x3_ssse3_do_jump
    299         dd .vp9_sad16x8x3_ssse3_aligned_by_12 - .vp9_sad16x8x3_ssse3_do_jump
    300         dd .vp9_sad16x8x3_ssse3_aligned_by_13 - .vp9_sad16x8x3_ssse3_do_jump
    301         dd .vp9_sad16x8x3_ssse3_aligned_by_14 - .vp9_sad16x8x3_ssse3_do_jump
    302         dd .vp9_sad16x8x3_ssse3_aligned_by_15 - .vp9_sad16x8x3_ssse3_do_jump
    303 .vp9_sad16x8x3_ssse3_skiptable:
    304 
    305         call .vp9_sad16x8x3_ssse3_do_jump
    306 .vp9_sad16x8x3_ssse3_do_jump:
    307         pop             rcx                         ; get the address of do_jump
    308         mov             rax,  .vp9_sad16x8x3_ssse3_jumptable - .vp9_sad16x8x3_ssse3_do_jump
    309         add             rax,  rcx  ; get the absolute address of vp9_sad16x8x3_ssse3_jumptable
    310 
    311         movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
    312         add             rcx,        rax
    313 
    314         movsxd          rax,        dword ptr arg(1) ;src_stride
    315         movsxd          rdx,        dword ptr arg(3) ;ref_stride
    316 
    317         jmp             rcx
    318 
    319         PROCESS_16X8X3_OFFSET 0,  .vp9_sad16x8x3_ssse3
    320         PROCESS_16X8X3_OFFSET 1,  .vp9_sad16x8x3_ssse3
    321         PROCESS_16X8X3_OFFSET 2,  .vp9_sad16x8x3_ssse3
    322         PROCESS_16X8X3_OFFSET 3,  .vp9_sad16x8x3_ssse3
    323         PROCESS_16X8X3_OFFSET 4,  .vp9_sad16x8x3_ssse3
    324         PROCESS_16X8X3_OFFSET 5,  .vp9_sad16x8x3_ssse3
    325         PROCESS_16X8X3_OFFSET 6,  .vp9_sad16x8x3_ssse3
    326         PROCESS_16X8X3_OFFSET 7,  .vp9_sad16x8x3_ssse3
    327         PROCESS_16X8X3_OFFSET 8,  .vp9_sad16x8x3_ssse3
    328         PROCESS_16X8X3_OFFSET 9,  .vp9_sad16x8x3_ssse3
    329         PROCESS_16X8X3_OFFSET 10, .vp9_sad16x8x3_ssse3
    330         PROCESS_16X8X3_OFFSET 11, .vp9_sad16x8x3_ssse3
    331         PROCESS_16X8X3_OFFSET 12, .vp9_sad16x8x3_ssse3
    332         PROCESS_16X8X3_OFFSET 13, .vp9_sad16x8x3_ssse3
    333         PROCESS_16X8X3_OFFSET 14, .vp9_sad16x8x3_ssse3
    334 
    335 .vp9_sad16x8x3_ssse3_aligned_by_15:
    336 
    337         PROCESS_16X2X3 1
    338         PROCESS_16X2X3 0
    339         PROCESS_16X2X3 0
    340         PROCESS_16X2X3 0
    341 
    342 .vp9_sad16x8x3_ssse3_store_off:
    343         mov             rdi,        arg(4) ;Results
    344 
    345         movq            xmm0,       xmm5
    346         psrldq          xmm5,       8
    347 
    348         paddw           xmm0,       xmm5
    349         movd            [rdi],      xmm0
    350 ;-
    351         movq            xmm0,       xmm6
    352         psrldq          xmm6,       8
    353 
    354         paddw           xmm0,       xmm6
    355         movd            [rdi+4],    xmm0
    356 ;-
    357         movq            xmm0,       xmm7
    358         psrldq          xmm7,       8
    359 
    360         paddw           xmm0,       xmm7
    361         movd            [rdi+8],    xmm0
    362 
    363     ; begin epilog
    364     pop         rcx
    365     pop         rdi
    366     pop         rsi
    367     RESTORE_XMM
    368     UNSHADOW_ARGS
    369     pop         rbp
    370     ret
    371