Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 ;void idct_dequant_0_2x_sse2
     15 ; (
     16 ;   short *qcoeff       - 0
     17 ;   short *dequant      - 1
     18 ;   unsigned char *pre  - 2
     19 ;   unsigned char *dst  - 3
     20 ;   int dst_stride      - 4
     21 ;   int blk_stride      - 5
     22 ; )
        ;
        ; DC-only path for two 4x4 blocks handled side by side (an 8-pixel-wide,
        ; 4-row area).  For each block only coefficient 0 is used: it is
        ; multiplied by dequant[0], rounded as (x + 4) >> 3 and added to every
        ; predictor pixel of that block.  Block 0 (qcoeff + 0) covers pixels 0-3
        ; of each row, block 1 (qcoeff + 32 bytes) covers pixels 4-7.
        ;
        ; pre : four 8-byte rows, blk_stride bytes apart
        ; dst : four 8-byte rows, dst_stride bytes apart (results saturated to
        ;       unsigned bytes)
        ; Side effect: the first 4 bytes of each block's coefficients are zeroed.
        ;
        ; Only xmm0-xmm5 are written, so no XMM register that is callee-saved
        ; under the Microsoft x64 convention (xmm6-xmm15) is clobbered.
     23 
     24 global sym(idct_dequant_0_2x_sse2)
     25 sym(idct_dequant_0_2x_sse2):
     26     push        rbp
     27     mov         rbp, rsp
     28     SHADOW_ARGS_TO_STACK 6
     29     GET_GOT     rbx
     30     ; end prolog
     31 
     32         mov         rdx,            arg(1) ; dequant
     33         mov         rax,            arg(0) ; qcoeff
     34 
     35     ; Zero out xmm5, for use clearing coeffs and unpacking
     36         pxor        xmm5,           xmm5
     37 
            ; word 0 <- block 0 DC, word 4 <- block 1 DC; xmm0 is only a
            ; temporary for the dequant factor (it is reloaded from pre below)
     38         movd        xmm4,           [rax]
     39         movd        xmm0,           [rdx]
     40 
     41         pinsrw      xmm4,           [rax+32],   4
     42         pinsrw      xmm0,           [rdx],      4
     43 
     44         pmullw      xmm4,           xmm0
     45 
     46     ; clear coeffs
     47         movd        [rax],          xmm5
     48         movd        [rax+32],       xmm5
     49 ; broadcast word 0 over the low 4 words and word 4 over the high 4 words
     50         pshuflw     xmm4,           xmm4,       00000000b
     51         pshufhw     xmm4,           xmm4,       00000000b
     52 
     53         mov         rax,            arg(2) ; pre
     54         paddw       xmm4,           [fours GLOBAL]
     55 
     56         movsxd      rcx,            dword ptr arg(5) ; blk_stride
     57         psraw       xmm4,           3           ; (dc + 4) >> 3
     58 
     59         movq        xmm0,           [rax]
     60         movq        xmm1,           [rax+rcx]
     61         movq        xmm2,           [rax+2*rcx]
     62         lea         rcx,            [3*rcx]
     63         movq        xmm3,           [rax+rcx]
     64 
            ; widen the predictor bytes to words
     65         punpcklbw   xmm0,           xmm5
     66         punpcklbw   xmm1,           xmm5
     67         punpcklbw   xmm2,           xmm5
     68         punpcklbw   xmm3,           xmm5
     69 
     70         mov         rax,            arg(3) ; dst
     71         movsxd      rdx,            dword ptr arg(4) ; dst_stride
     72 
     73     ; Add to predict buffer
     74         paddw       xmm0,           xmm4
     75         paddw       xmm1,           xmm4
     76         paddw       xmm2,           xmm4
     77         paddw       xmm3,           xmm4
     78 
     79     ; pack up before storing (unsigned saturation; only the low 8 bytes are stored)
     80         packuswb    xmm0,           xmm5
     81         packuswb    xmm1,           xmm5
     82         packuswb    xmm2,           xmm5
     83         packuswb    xmm3,           xmm5
     84 
     85     ; store blocks back out
     86         movq        [rax],          xmm0
     87         movq        [rax + rdx],    xmm1
     88 
     89         lea         rax,            [rax + 2*rdx]
     90 
     91         movq        [rax],          xmm2
     92         movq        [rax + rdx],    xmm3
     93 
     94     ; begin epilog
     95     RESTORE_GOT
     96     UNSHADOW_ARGS
     97     pop         rbp
     98     ret
     99 
    100 global sym(idct_dequant_full_2x_sse2)
        ;
        ; Dequantize, inverse-transform and reconstruct two 4x4 blocks at once.
        ;
        ; Arguments, as annotated at the arg() loads below:
        ;   arg(0) qcoeff, arg(1) dequant, arg(2) pre, arg(3) dst,
        ;   arg(4) dst_stride, arg(5) blk_stride
        ;
        ; qcoeff  : 64 bytes are read with movdqa and then zeroed, so the
        ;           pointer must be 16-byte aligned; block 0 is at +0 and
        ;           block 1 at +32 bytes.
        ; dequant : 32 bytes, used as pmullw memory operands (16-byte aligned)
        ;           and applied to both blocks.
        ; pre     : four 8-byte rows, blk_stride bytes apart.
        ; dst     : four 8-byte rows, dst_stride bytes apart, written with
        ;           unsigned saturation.
        ;
        ; Each XMM register holds one row of block 0 in its low 4 words and the
        ; same row of block 1 in its high 4 words, so both blocks go through
        ; the two butterfly passes together.
        ;
        ; NOTE(review): xmm6 and xmm7 are overwritten below and no save/restore
        ; of them is visible in this function; under the Microsoft x64 calling
        ; convention xmm6-xmm15 are callee-saved - confirm whether Win64 builds
        ; need them preserved here.
    101 sym(idct_dequant_full_2x_sse2):
    102     push        rbp
    103     mov         rbp, rsp
    104     SHADOW_ARGS_TO_STACK 7
    105     GET_GOT     rbx
    106     push        rsi
    107     push        rdi
    108     ; end prolog
    109 
    110     ; general case: every coefficient of both blocks is loaded,
    111     ; dequantized and run through the two-pass inverse transform
    112         mov         rax,            arg(0) ; qcoeff
    113         mov         rsi,            arg(2) ; pre
    114         mov         rdi,            arg(3) ; dst
    115         movsxd      rcx,            dword ptr arg(5) ; blk_stride
    116 
    117     ; Zero out xmm7, for use unpacking
    118         pxor        xmm7,           xmm7
    119 
    120         mov         rdx,            arg(1)  ; dequant
    121 
    122     ; note the transpose of xmm1 and xmm2, necessary for shuffle
    123     ;   to spit out sensible data
    124         movdqa      xmm0,           [rax]
    125         movdqa      xmm2,           [rax+16]
    126         movdqa      xmm1,           [rax+32]
    127         movdqa      xmm3,           [rax+48]
    128 
    129     ; Clear out coeffs
    130         movdqa      [rax],          xmm7
    131         movdqa      [rax+16],       xmm7
    132         movdqa      [rax+32],       xmm7
    133         movdqa      [rax+48],       xmm7
    134 
    135     ; dequantize qcoeff buffer
    136         pmullw      xmm0,           [rdx]
    137         pmullw      xmm2,           [rdx+16]
    138         pmullw      xmm1,           [rdx]
    139         pmullw      xmm3,           [rdx+16]
    140 
    141     ; repack so block 0 row x and block 1 row x are together
    142         movdqa      xmm4,           xmm0
    143         punpckldq   xmm0,           xmm1
    144         punpckhdq   xmm4,           xmm1
    145 
    146         pshufd      xmm0,           xmm0,       11011000b
    147         pshufd      xmm1,           xmm4,       11011000b
    148 
    149         movdqa      xmm4,           xmm2
    150         punpckldq   xmm2,           xmm3
    151         punpckhdq   xmm4,           xmm3
    152 
    153         pshufd      xmm2,           xmm2,       11011000b
    154         pshufd      xmm3,           xmm4,       11011000b
    155 
    156     ; first pass
            ; x_s1sqr2 is negative as a signed word, so pmulhw gives
            ; (x * 0x8A8C >> 16) - x; the paddw that follows adds x back.
            ; x_c1sqr2less1 holds the multiplier minus one, hence the same
            ; pmulhw + paddw pairing.
    157         psubw       xmm0,           xmm2        ; b1 = 0-2
    158         paddw       xmm2,           xmm2        ;
    159 
    160         movdqa      xmm5,           xmm1
    161         paddw       xmm2,           xmm0        ; a1 = 0+2
    162 
    163         pmulhw      xmm5,           [x_s1sqr2 GLOBAL]
    164         paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
    165 
    166         movdqa      xmm7,           xmm3
    167         pmulhw      xmm7,           [x_c1sqr2less1 GLOBAL]
    168 
    169         paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
    170         psubw       xmm7,           xmm5        ; c1
    171 
    172         movdqa      xmm5,           xmm1
    173         movdqa      xmm4,           xmm3
    174 
    175         pmulhw      xmm5,           [x_c1sqr2less1 GLOBAL]
    176         paddw       xmm5,           xmm1
    177 
    178         pmulhw      xmm3,           [x_s1sqr2 GLOBAL]
    179         paddw       xmm3,           xmm4
    180 
    181         paddw       xmm3,           xmm5        ; d1
    182         movdqa      xmm6,           xmm2        ; a1
    183 
    184         movdqa      xmm4,           xmm0        ; b1
    185         paddw       xmm2,           xmm3        ;0
    186 
    187         paddw       xmm4,           xmm7        ;1
    188         psubw       xmm0,           xmm7        ;2
    189 
    190         psubw       xmm6,           xmm3        ;3
    191 
    192     ; transpose for the second pass
            ; per the element labels below, xmm2 / xmm0 / xmm4 / xmm6 are
            ; consumed as elements 0-3 / 4-7 / 8-11 / 12-15 of each block
    193         movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
    194         punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
    195         punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
    196 
    197         movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
    198         punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
    199         punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
    200 
    201 
    202         movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
    203         punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
    204         punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
    205 
    206         movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
    207         punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
    208         punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
    209 
    210 
    211         movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
    212         punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
    213         punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
    214 
    215         movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
    216         punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
    217         punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
    218 
            ; 11011000b swaps the two middle dwords, regrouping each register
            ; as block 0 in the low half and block 1 in the high half
    219         pshufd      xmm0,           xmm2,       11011000b
    220         pshufd      xmm2,           xmm1,       11011000b
    221 
    222         pshufd      xmm1,           xmm5,       11011000b
    223         pshufd      xmm3,           xmm7,       11011000b
    224 
    225     ; second pass
    226         psubw       xmm0,           xmm2            ; b1 = 0-2
    227         paddw       xmm2,           xmm2
    228 
    229         movdqa      xmm5,           xmm1
    230         paddw       xmm2,           xmm0            ; a1 = 0+2
    231 
    232         pmulhw      xmm5,           [x_s1sqr2 GLOBAL]
    233         paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
    234 
    235         movdqa      xmm7,           xmm3
    236         pmulhw      xmm7,           [x_c1sqr2less1 GLOBAL]
    237 
    238         paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
    239         psubw       xmm7,           xmm5            ; c1
    240 
    241         movdqa      xmm5,           xmm1
    242         movdqa      xmm4,           xmm3
    243 
    244         pmulhw      xmm5,           [x_c1sqr2less1 GLOBAL]
    245         paddw       xmm5,           xmm1
    246 
    247         pmulhw      xmm3,           [x_s1sqr2 GLOBAL]
    248         paddw       xmm3,           xmm4
    249 
            ; the rounding bias (+4) is folded into a1 and b1 so that all four
            ; outputs carry it before the >> 3 below
    250         paddw       xmm3,           xmm5            ; d1
    251         paddw       xmm0,           [fours GLOBAL]
    252 
    253         paddw       xmm2,           [fours GLOBAL]
    254         movdqa      xmm6,           xmm2            ; a1
    255 
    256         movdqa      xmm4,           xmm0            ; b1
    257         paddw       xmm2,           xmm3            ;0
    258 
    259         paddw       xmm4,           xmm7            ;1
    260         psubw       xmm0,           xmm7            ;2
    261 
    262         psubw       xmm6,           xmm3            ;3
    263         psraw       xmm2,           3
    264 
    265         psraw       xmm0,           3
    266         psraw       xmm4,           3
    267 
    268         psraw       xmm6,           3
    269 
    270     ; transpose to save
    271         movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
    272         punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
    273         punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
    274 
    275         movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
    276         punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
    277         punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
    278 
    279 
    280         movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
    281         punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
    282         punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
    283 
    284         movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
    285         punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
    286         punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
    287 
    288 
    289         movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
    290         punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
    291         punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
    292 
    293         movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
    294         punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
    295         punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
    296 
    297         pshufd      xmm0,           xmm2,       11011000b
    298         pshufd      xmm2,           xmm1,       11011000b
    299 
    300         pshufd      xmm1,           xmm5,       11011000b
    301         pshufd      xmm3,           xmm7,       11011000b
    302 
            ; xmm7 was used as scratch above; re-zero it for unpacking/packing
    303         pxor        xmm7,           xmm7
    304 
    305     ; Load up predict blocks
    306         movq        xmm4,           [rsi]
    307         movq        xmm5,           [rsi+rcx]
    308 
    309         punpcklbw   xmm4,           xmm7
    310         punpcklbw   xmm5,           xmm7
    311 
    312         paddw       xmm0,           xmm4
    313         paddw       xmm1,           xmm5
    314 
    315         movq        xmm4,           [rsi+2*rcx]
    316         lea         rcx,            [3*rcx]
    317         movq        xmm5,           [rsi+rcx]
    318 
    319         punpcklbw   xmm4,           xmm7
    320         punpcklbw   xmm5,           xmm7
    321 
    322         paddw       xmm2,           xmm4
    323         paddw       xmm3,           xmm5
    324 
        ; no jump to this label is visible in this function; execution falls through
    325 .finish:
    326 
    327     ; pack up before storing
    328         packuswb    xmm0,           xmm7
    329         packuswb    xmm1,           xmm7
    330         packuswb    xmm2,           xmm7
    331         packuswb    xmm3,           xmm7
    332 
    333     ; Load destination stride before writing out,
    334     ;   doesn't need to persist
    335         movsxd      rdx,            dword ptr arg(4) ; dst_stride
    336 
    337     ; store blocks back out
    338         movq        [rdi],          xmm0
    339         movq        [rdi + rdx],    xmm1
    340 
    341         lea         rdi,            [rdi + 2*rdx]
    342 
    343         movq        [rdi],          xmm2
    344         movq        [rdi + rdx],    xmm3
    345 
    346     ; begin epilog
    347     pop         rdi
    348     pop         rsi
    349     RESTORE_GOT
    350     UNSHADOW_ARGS
    351     pop         rbp
    352     ret
    353 
    354 ;void idct_dequant_dc_0_2x_sse2
    355 ; (
    356 ;   short *qcoeff       - 0
    357 ;   short *dequant      - 1
    358 ;   unsigned char *pre  - 2
    359 ;   unsigned char *dst  - 3
    360 ;   int dst_stride      - 4
    361 ;   short *dc           - 5
    362 ; )
        ;
        ; DC-only path for two 4x4 blocks handled side by side, with the DC
        ; values supplied through the dc argument (two consecutive 16-bit
        ; words, one per block).  Each DC is rounded as (dc + 4) >> 3 and added
        ; to the block's predictor pixels: dc[0] to pixels 0-3 of every row,
        ; dc[1] to pixels 4-7.  qcoeff and dequant are not read.
        ;
        ; pre : four 8-byte rows at a fixed pitch of 16 bytes
        ; dst : four 8-byte rows, dst_stride bytes apart (results saturated to
        ;       unsigned bytes)
        ;
        ; Only xmm0-xmm5 are written, so no XMM register that is callee-saved
        ; under the Microsoft x64 convention (xmm6-xmm15) is clobbered.
    363 global sym(idct_dequant_dc_0_2x_sse2)
    364 sym(idct_dequant_dc_0_2x_sse2):
    365     push        rbp
    366     mov         rbp, rsp
    367     SHADOW_ARGS_TO_STACK 7
    368     GET_GOT     rbx
    369     push        rsi
    370     push        rdi
    371     ; end prolog
    372 
    373     ; special case when 2 blocks have 0 or 1 coeffs
    374     ; the DC values come from the dc argument, so qcoeff is not loaded
    376         mov         rsi,            arg(2) ; pre
    377         mov         rdi,            arg(3) ; dst
    378         mov         rdx,            arg(5) ; dc
    379 
    380     ; Zero out xmm5, for use unpacking
    381         pxor        xmm5,           xmm5
    382 
    383     ; load up 2 dc words here == 2*16 = doubleword
    384         movd        xmm4,           [rdx]
    385 
    386     ; Load up predict blocks
    387         movq        xmm0,           [rsi]
    388         movq        xmm1,           [rsi+16]
    389         movq        xmm2,           [rsi+32]
    390         movq        xmm3,           [rsi+48]
    391 
    392     ; Duplicate and expand dc across:
    393     ;   dc0 in the low 4 words, dc1 in the high 4 words
    393         punpcklwd   xmm4,           xmm4
    394         punpckldq   xmm4,           xmm4
    395 
    396     ; Round and downshift: (dc + 4) >> 3
    397         paddw       xmm4,           [fours GLOBAL]
    398         psraw       xmm4,           3
    399 
    400     ; Predict buffer needs to be expanded from bytes to words
    401         punpcklbw   xmm0,           xmm5
    402         punpcklbw   xmm1,           xmm5
    403         punpcklbw   xmm2,           xmm5
    404         punpcklbw   xmm3,           xmm5
    405 
    406     ; Add to predict buffer
    407         paddw       xmm0,           xmm4
    408         paddw       xmm1,           xmm4
    409         paddw       xmm2,           xmm4
    410         paddw       xmm3,           xmm4
    411 
    412     ; pack up before storing (unsigned saturation; only the low 8 bytes are stored)
    413         packuswb    xmm0,           xmm5
    414         packuswb    xmm1,           xmm5
    415         packuswb    xmm2,           xmm5
    416         packuswb    xmm3,           xmm5
    417 
    418     ; Load destination stride before writing out,
    419     ;   doesn't need to persist
    420         movsxd      rdx,            dword ptr arg(4) ; dst_stride
    421 
    422     ; store blocks back out
    423         movq        [rdi],          xmm0
    424         movq        [rdi + rdx],    xmm1
    425 
    426         lea         rdi,            [rdi + 2*rdx]
    427 
    428         movq        [rdi],          xmm2
    429         movq        [rdi + rdx],    xmm3
    430 
    431     ; begin epilog
    432     pop         rdi
    433     pop         rsi
    434     RESTORE_GOT
    435     UNSHADOW_ARGS
    436     pop         rbp
    437     ret
    438 
    439 global sym(idct_dequant_dc_full_2x_sse2)
        ;
        ; Dequantize, inverse-transform and reconstruct two 4x4 blocks at once,
        ; taking each block's DC term from a separate dc array instead of from
        ; the dequantized coefficient 0.
        ;
        ; Arguments, as annotated at the arg() loads below:
        ;   arg(0) qcoeff, arg(1) dequant, arg(2) pre, arg(3) dst,
        ;   arg(4) dst_stride, arg(5) dc
        ;
        ; qcoeff  : 64 bytes are read with movdqa and then zeroed, so the
        ;           pointer must be 16-byte aligned; block 0 is at +0 and
        ;           block 1 at +32 bytes.
        ; dequant : 32 bytes, used as pmullw memory operands (16-byte aligned)
        ;           and applied to both blocks.
        ; dc      : two consecutive 16-bit words; dc[0] replaces element 0 of
        ;           block 0 and dc[1] replaces element 0 of block 1 after
        ;           dequantization.
        ; pre     : four 8-byte rows at a fixed pitch of 16 bytes.
        ; dst     : four 8-byte rows, dst_stride bytes apart, written with
        ;           unsigned saturation.
        ;
        ; NOTE(review): xmm6 and xmm7 are overwritten below and no save/restore
        ; of them is visible in this function; under the Microsoft x64 calling
        ; convention xmm6-xmm15 are callee-saved - confirm whether Win64 builds
        ; need them preserved here.
    440 sym(idct_dequant_dc_full_2x_sse2):
    441     push        rbp
    442     mov         rbp, rsp
    443     SHADOW_ARGS_TO_STACK 7
    444     GET_GOT     rbx
    445     push        rsi
    446     push        rdi
    447     ; end prolog
    448 
    449     ; general case: every coefficient of both blocks is loaded and
    450     ; dequantized; the DC terms are then replaced from the dc argument
    451         mov         rax,            arg(0) ; qcoeff
    452         mov         rsi,            arg(2) ; pre
    453         mov         rdi,            arg(3) ; dst
    454 
    455     ; Zero out xmm7, for use unpacking
    456         pxor        xmm7,           xmm7
    457 
    458         mov         rdx,            arg(1)  ; dequant
    459 
    460     ; note the transpose of xmm1 and xmm2, necessary for shuffle
    461     ;   to spit out sensible data
    462         movdqa      xmm0,           [rax]
    463         movdqa      xmm2,           [rax+16]
    464         movdqa      xmm1,           [rax+32]
    465         movdqa      xmm3,           [rax+48]
    466 
    467     ; Clear out coeffs
    468         movdqa      [rax],          xmm7
    469         movdqa      [rax+16],       xmm7
    470         movdqa      [rax+32],       xmm7
    471         movdqa      [rax+48],       xmm7
    472 
    473     ; dequantize qcoeff buffer
    474         pmullw      xmm0,           [rdx]
    475         pmullw      xmm2,           [rdx+16]
    476         pmullw      xmm1,           [rdx]
    477         pmullw      xmm3,           [rdx+16]
    478 
    479     ; DC component
    480         mov         rdx,            arg(5)
    481 
    482     ; repack so block 0 row x and block 1 row x are together
    483         movdqa      xmm4,           xmm0
    484         punpckldq   xmm0,           xmm1
    485         punpckhdq   xmm4,           xmm1
    486 
    487         pshufd      xmm0,           xmm0,       11011000b
    488         pshufd      xmm1,           xmm4,       11011000b
    489 
    490         movdqa      xmm4,           xmm2
    491         punpckldq   xmm2,           xmm3
    492         punpckhdq   xmm4,           xmm3
    493 
    494         pshufd      xmm2,           xmm2,       11011000b
    495         pshufd      xmm3,           xmm4,       11011000b
    496 
    497     ; insert DC component (word 0 = block 0 element 0, word 4 = block 1 element 0)
    498         pinsrw      xmm0,           [rdx],      0
    499         pinsrw      xmm0,           [rdx+2],    4
    500 
    501     ; first pass
            ; x_s1sqr2 is negative as a signed word, so pmulhw gives
            ; (x * 0x8A8C >> 16) - x; the paddw that follows adds x back.
            ; x_c1sqr2less1 holds the multiplier minus one, hence the same
            ; pmulhw + paddw pairing.
    502         psubw       xmm0,           xmm2        ; b1 = 0-2
    503         paddw       xmm2,           xmm2        ;
    504 
    505         movdqa      xmm5,           xmm1
    506         paddw       xmm2,           xmm0        ; a1 = 0+2
    507 
    508         pmulhw      xmm5,           [x_s1sqr2 GLOBAL]
    509         paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
    510 
    511         movdqa      xmm7,           xmm3
    512         pmulhw      xmm7,           [x_c1sqr2less1 GLOBAL]
    513 
    514         paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
    515         psubw       xmm7,           xmm5        ; c1
    516 
    517         movdqa      xmm5,           xmm1
    518         movdqa      xmm4,           xmm3
    519 
    520         pmulhw      xmm5,           [x_c1sqr2less1 GLOBAL]
    521         paddw       xmm5,           xmm1
    522 
    523         pmulhw      xmm3,           [x_s1sqr2 GLOBAL]
    524         paddw       xmm3,           xmm4
    525 
    526         paddw       xmm3,           xmm5        ; d1
    527         movdqa      xmm6,           xmm2        ; a1
    528 
    529         movdqa      xmm4,           xmm0        ; b1
    530         paddw       xmm2,           xmm3        ;0
    531 
    532         paddw       xmm4,           xmm7        ;1
    533         psubw       xmm0,           xmm7        ;2
    534 
    535         psubw       xmm6,           xmm3        ;3
    536 
    537     ; transpose for the second pass
            ; per the element labels below, xmm2 / xmm0 / xmm4 / xmm6 are
            ; consumed as elements 0-3 / 4-7 / 8-11 / 12-15 of each block
    538         movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
    539         punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
    540         punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
    541 
    542         movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
    543         punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
    544         punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
    545 
    546 
    547         movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
    548         punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
    549         punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
    550 
    551         movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
    552         punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
    553         punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
    554 
    555 
    556         movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
    557         punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
    558         punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
    559 
    560         movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
    561         punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
    562         punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
    563 
            ; 11011000b swaps the two middle dwords, regrouping each register
            ; as block 0 in the low half and block 1 in the high half
    564         pshufd      xmm0,           xmm2,       11011000b
    565         pshufd      xmm2,           xmm1,       11011000b
    566 
    567         pshufd      xmm1,           xmm5,       11011000b
    568         pshufd      xmm3,           xmm7,       11011000b
    569 
    570     ; second pass
    571         psubw       xmm0,           xmm2            ; b1 = 0-2
    572         paddw       xmm2,           xmm2
    573 
    574         movdqa      xmm5,           xmm1
    575         paddw       xmm2,           xmm0            ; a1 = 0+2
    576 
    577         pmulhw      xmm5,           [x_s1sqr2 GLOBAL]
    578         paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
    579 
    580         movdqa      xmm7,           xmm3
    581         pmulhw      xmm7,           [x_c1sqr2less1 GLOBAL]
    582 
    583         paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
    584         psubw       xmm7,           xmm5            ; c1
    585 
    586         movdqa      xmm5,           xmm1
    587         movdqa      xmm4,           xmm3
    588 
    589         pmulhw      xmm5,           [x_c1sqr2less1 GLOBAL]
    590         paddw       xmm5,           xmm1
    591 
    592         pmulhw      xmm3,           [x_s1sqr2 GLOBAL]
    593         paddw       xmm3,           xmm4
    594 
            ; the rounding bias (+4) is folded into a1 and b1 so that all four
            ; outputs carry it before the >> 3 below
    595         paddw       xmm3,           xmm5            ; d1
    596         paddw       xmm0,           [fours GLOBAL]
    597 
    598         paddw       xmm2,           [fours GLOBAL]
    599         movdqa      xmm6,           xmm2            ; a1
    600 
    601         movdqa      xmm4,           xmm0            ; b1
    602         paddw       xmm2,           xmm3            ;0
    603 
    604         paddw       xmm4,           xmm7            ;1
    605         psubw       xmm0,           xmm7            ;2
    606 
    607         psubw       xmm6,           xmm3            ;3
    608         psraw       xmm2,           3
    609 
    610         psraw       xmm0,           3
    611         psraw       xmm4,           3
    612 
    613         psraw       xmm6,           3
    614 
    615     ; transpose to save
    616         movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
    617         punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
    618         punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
    619 
    620         movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
    621         punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
    622         punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
    623 
    624 
    625         movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
    626         punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
    627         punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
    628 
    629         movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
    630         punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
    631         punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
    632 
    633 
    634         movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
    635         punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
    636         punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
    637 
    638         movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
    639         punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
    640         punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
    641 
    642         pshufd      xmm0,           xmm2,       11011000b
    643         pshufd      xmm2,           xmm1,       11011000b
    644 
    645         pshufd      xmm1,           xmm5,       11011000b
    646         pshufd      xmm3,           xmm7,       11011000b
    647 
            ; xmm7 was used as scratch above; re-zero it for unpacking/packing
    648         pxor        xmm7,           xmm7
    649 
    650     ; Load up predict blocks
    651         movq        xmm4,           [rsi]
    652         movq        xmm5,           [rsi+16]
    653 
    654         punpcklbw   xmm4,           xmm7
    655         punpcklbw   xmm5,           xmm7
    656 
    657         paddw       xmm0,           xmm4
    658         paddw       xmm1,           xmm5
    659 
    660         movq        xmm4,           [rsi+32]
    661         movq        xmm5,           [rsi+48]
    662 
    663         punpcklbw   xmm4,           xmm7
    664         punpcklbw   xmm5,           xmm7
    665 
    666         paddw       xmm2,           xmm4
    667         paddw       xmm3,           xmm5
    668 
        ; no jump to this label is visible in this function; execution falls through
    669 .finish:
    670 
    671     ; pack up before storing
    672         packuswb    xmm0,           xmm7
    673         packuswb    xmm1,           xmm7
    674         packuswb    xmm2,           xmm7
    675         packuswb    xmm3,           xmm7
    676 
    677     ; Load destination stride before writing out,
    678     ;   doesn't need to persist
    679         movsxd      rdx,            dword ptr arg(4) ; dst_stride
    680 
    681     ; store blocks back out
    682         movq        [rdi],          xmm0
    683         movq        [rdi + rdx],    xmm1
    684 
    685         lea         rdi,            [rdi + 2*rdx]
    686 
    687         movq        [rdi],          xmm2
    688         movq        [rdi + rdx],    xmm3
    689 
    690 
    691     ; begin epilog
    692     pop         rdi
    693     pop         rsi
    694     RESTORE_GOT
    695     UNSHADOW_ARGS
    696     pop         rbp
    697     ret
    698 
    699 SECTION_RODATA
        ; Each table is eight identical 16-bit words (one full XMM register),
        ; 16-byte aligned because it is used directly as an SSE2 memory operand.
        ;
        ; fours: rounding bias added before the arithmetic shift right by 3.
    700 align 16
    701 fours:
    702     dw 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004
        ; x_s1sqr2: 0x8A8C = 35468 ~= sin(pi/8) * sqrt(2) * 65536.  Negative as
        ; a signed word, so users follow pmulhw with a paddw of the input.
    703 align 16
    704 x_s1sqr2:
    705     dw 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C
        ; x_c1sqr2less1: 0x4E7B = 20091 ~= (cos(pi/8) * sqrt(2) - 1) * 65536;
        ; users add the input back after pmulhw to restore the "+ 1".
    706 align 16
    707 x_c1sqr2less1:
    708     dw 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B
    709