;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

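; PROCESS_16X2X3 accumulates SADs for two 16-pixel rows against three
; reference candidates (ref, ref+1, ref+2) using unaligned loads (lddqu)
; and psadbw.  The first invocation (%1 nonzero) initializes the running
; sums in xmm5/xmm6/xmm7; later invocations accumulate into them.  The
; macro advances rsi and rdi by two rows (rax/rdx hold the strides).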
%macro PROCESS_16X2X3 1
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]
        lddqu           xmm5,       XMMWORD PTR [rdi]
        lddqu           xmm6,       XMMWORD PTR [rdi+1]
        lddqu           xmm7,       XMMWORD PTR [rdi+2]

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]
        lddqu           xmm1,       XMMWORD PTR [rdi]
        lddqu           xmm2,       XMMWORD PTR [rdi+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

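; PROCESS_16X2X3_OFFSET is the SSSE3 variant used when the reference
; alignment is known: it reads two aligned 16-byte blocks per row and
; uses palignr to synthesize the three byte-shifted candidates at
; offsets %2, %2+1 and %2+2, avoiding unaligned loads.  %1 again selects
; whether the xmm5/xmm6/xmm7 accumulators are initialized or updated.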
%macro PROCESS_16X2X3_OFFSET 2
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
        movdqa          xmm7,       XMMWORD PTR [rdi+16]

        movdqa          xmm5,       xmm7
        palignr         xmm5,       xmm4,       %2

        movdqa          xmm6,       xmm7
        palignr         xmm6,       xmm4,       (%2+1)

        palignr         xmm7,       xmm4,       (%2+2)

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
        movdqa          xmm3,       XMMWORD PTR [rdi+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

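; PROCESS_16X16X3_OFFSET emits one %2_aligned_by_%1 code path: it rewinds
; rdi to the previous 16-byte boundary and unrolls eight two-row steps
; (16 rows total), then jumps to the shared store code.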
%macro PROCESS_16X16X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1

        PROCESS_16X2X3_OFFSET 1, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1

        jmp             %2_store_off

%endmacro

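; PROCESS_16X8X3_OFFSET is the 16x8 counterpart: four two-row steps
; (8 rows total) for the same alignment-specific path.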
%macro PROCESS_16X8X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1

        PROCESS_16X2X3_OFFSET 1, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1

        jmp             %2_store_off

%endmacro

;void vp8_sad16x16x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
global sym(vp8_sad16x16x3_ssse3)
sym(vp8_sad16x16x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        mov             rdx,        0xf
        and             rdx,        rdi

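; Position-independent dispatch on the low four bits of ref_ptr: the
; jump table below stores 32-bit offsets relative to the do_jump label,
; the call/pop pair recovers that label's address at run time, and the
; indexed offset is added to it before jumping to the matching
; aligned_by_N path.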
        jmp vp8_sad16x16x3_ssse3_skiptable
vp8_sad16x16x3_ssse3_jumptable:
        dd vp8_sad16x16x3_ssse3_aligned_by_0  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_1  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_2  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_3  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_4  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_5  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_6  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_7  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_8  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_9  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_10 - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_11 - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_12 - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_13 - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_14 - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_15 - vp8_sad16x16x3_ssse3_do_jump
vp8_sad16x16x3_ssse3_skiptable:

        call vp8_sad16x16x3_ssse3_do_jump
vp8_sad16x16x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  vp8_sad16x16x3_ssse3_jumptable - vp8_sad16x16x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx

        PROCESS_16X16X3_OFFSET 0,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 1,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 2,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 3,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 4,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 5,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 6,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 7,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 8,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 9,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 10, vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 11, vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 12, vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 13, vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 14, vp8_sad16x16x3_ssse3

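; At alignment 15 the ref+2 candidate extends past the two aligned
; 16-byte loads the palignr scheme relies on, so this path falls back
; to the unaligned-load macro instead.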
vp8_sad16x16x3_ssse3_aligned_by_15:
        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

vp8_sad16x16x3_ssse3_store_off:
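; Each psadbw result holds two partial sums, one per 64-bit half; fold
; the upper half into the lower one and store the three SADs to
; results[0], results[1] and results[2].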
        mov             rdi,        arg(4) ;Results

        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_sad16x8x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
global sym(vp8_sad16x8x3_ssse3)
sym(vp8_sad16x8x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        mov             rdx,        0xf
        and             rdx,        rdi

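; Same position-independent jump-table dispatch as in
; vp8_sad16x16x3_ssse3 above, keyed on the low four bits of ref_ptr.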
        jmp vp8_sad16x8x3_ssse3_skiptable
vp8_sad16x8x3_ssse3_jumptable:
        dd vp8_sad16x8x3_ssse3_aligned_by_0  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_1  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_2  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_3  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_4  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_5  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_6  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_7  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_8  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_9  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_10 - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_11 - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_12 - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_13 - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_14 - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_15 - vp8_sad16x8x3_ssse3_do_jump
vp8_sad16x8x3_ssse3_skiptable:

        call vp8_sad16x8x3_ssse3_do_jump
vp8_sad16x8x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  vp8_sad16x8x3_ssse3_jumptable - vp8_sad16x8x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx

        PROCESS_16X8X3_OFFSET 0,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 1,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 2,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 3,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 4,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 5,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 6,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 7,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 8,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 9,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 10, vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 11, vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 12, vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 13, vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 14, vp8_sad16x8x3_ssse3

vp8_sad16x8x3_ssse3_aligned_by_15:

        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

vp8_sad16x8x3_ssse3_store_off:
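; Fold the two psadbw halves and store the three SADs, as in the 16x16
; version above.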
        mov             rdi,        arg(4) ;Results

        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
    367