Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 ;void vp8_idct_dequant_0_2x_sse2
     15 ; (
     16 ;   short *qcoeff       - 0
     17 ;   short *dequant      - 1
     18 ;   unsigned char *dst  - 2
     19 ;   int dst_stride      - 3
     20 ; )
     21 
     22 SECTION .text
     23 
     24 global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
     25 sym(vp8_idct_dequant_0_2x_sse2):
     26     push        rbp
     27     mov         rbp, rsp
     28     SHADOW_ARGS_TO_STACK 4
     29     GET_GOT     rbx
     30     ; end prolog
     31 
     32         mov         rdx,            arg(1) ; dequant
     33         mov         rax,            arg(0) ; qcoeff
     34 
     35         movd        xmm4,           [rax]
     36         movd        xmm5,           [rdx]
     37 
     38         pinsrw      xmm4,           [rax+32],   4
     39         pinsrw      xmm5,           [rdx],      4
     40 
     41         pmullw      xmm4,           xmm5
     42 
     43     ; Zero out xmm5, for use unpacking
     44         pxor        xmm5,           xmm5
     45 
     46     ; clear coeffs
     47         movd        [rax],          xmm5
     48         movd        [rax+32],       xmm5
     49 ;pshufb
     50         mov         rax,            arg(2) ; dst
     51         movsxd      rdx,            dword ptr arg(3) ; dst_stride
     52 
     53         pshuflw     xmm4,           xmm4,       00000000b
     54         pshufhw     xmm4,           xmm4,       00000000b
     55 
     56         lea         rcx,            [rdx + rdx*2]
     57         paddw       xmm4,           [GLOBAL(fours)]
     58 
     59         psraw       xmm4,           3
     60 
     61         movq        xmm0,           [rax]
     62         movq        xmm1,           [rax+rdx]
     63         movq        xmm2,           [rax+2*rdx]
     64         movq        xmm3,           [rax+rcx]
     65 
     66         punpcklbw   xmm0,           xmm5
     67         punpcklbw   xmm1,           xmm5
     68         punpcklbw   xmm2,           xmm5
     69         punpcklbw   xmm3,           xmm5
     70 
     71 
     72     ; Add to predict buffer
     73         paddw       xmm0,           xmm4
     74         paddw       xmm1,           xmm4
     75         paddw       xmm2,           xmm4
     76         paddw       xmm3,           xmm4
     77 
     78     ; pack up before storing
     79         packuswb    xmm0,           xmm5
     80         packuswb    xmm1,           xmm5
     81         packuswb    xmm2,           xmm5
     82         packuswb    xmm3,           xmm5
     83 
     84     ; store blocks back out
     85         movq        [rax],          xmm0
     86         movq        [rax + rdx],    xmm1
     87 
     88         lea         rax,            [rax + 2*rdx]
     89 
     90         movq        [rax],          xmm2
     91         movq        [rax + rdx],    xmm3
     92 
     93     ; begin epilog
     94     RESTORE_GOT
     95     UNSHADOW_ARGS
     96     pop         rbp
     97     ret
     98 
     99 ;void vp8_idct_dequant_full_2x_sse2
    100 ; (
    101 ;   short *qcoeff       - 0
    102 ;   short *dequant      - 1
    103 ;   unsigned char *dst  - 2
    104 ;   int dst_stride      - 3
    105 ; )
    106 global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
    107 sym(vp8_idct_dequant_full_2x_sse2):
    108     push        rbp
    109     mov         rbp, rsp
    110     SHADOW_ARGS_TO_STACK 4
    111     SAVE_XMM 7
    112     GET_GOT     rbx
    113     push        rsi
    114     push        rdi
    115     ; end prolog
    116 
    117     ; special case when 2 blocks have 0 or 1 coeffs
    118     ; dc is set as first coeff, so no need to load qcoeff
    119         mov         rax,            arg(0) ; qcoeff
    120         mov         rdx,            arg(1)  ; dequant
    121         mov         rdi,            arg(2) ; dst
    122 
    123 
    124     ; Zero out xmm7, for use unpacking
    125         pxor        xmm7,           xmm7
    126 
    127 
    128     ; note the transpose of xmm1 and xmm2, necessary for shuffle
    129     ;   to spit out sensicle data
    130         movdqa      xmm0,           [rax]
    131         movdqa      xmm2,           [rax+16]
    132         movdqa      xmm1,           [rax+32]
    133         movdqa      xmm3,           [rax+48]
    134 
    135     ; Clear out coeffs
    136         movdqa      [rax],          xmm7
    137         movdqa      [rax+16],       xmm7
    138         movdqa      [rax+32],       xmm7
    139         movdqa      [rax+48],       xmm7
    140 
    141     ; dequantize qcoeff buffer
    142         pmullw      xmm0,           [rdx]
    143         pmullw      xmm2,           [rdx+16]
    144         pmullw      xmm1,           [rdx]
    145         pmullw      xmm3,           [rdx+16]
    146         movsxd      rdx,            dword ptr arg(3) ; dst_stride
    147 
    148     ; repack so block 0 row x and block 1 row x are together
    149         movdqa      xmm4,           xmm0
    150         punpckldq   xmm0,           xmm1
    151         punpckhdq   xmm4,           xmm1
    152 
    153         pshufd      xmm0,           xmm0,       11011000b
    154         pshufd      xmm1,           xmm4,       11011000b
    155 
    156         movdqa      xmm4,           xmm2
    157         punpckldq   xmm2,           xmm3
    158         punpckhdq   xmm4,           xmm3
    159 
    160         pshufd      xmm2,           xmm2,       11011000b
    161         pshufd      xmm3,           xmm4,       11011000b
    162 
    163     ; first pass
    164         psubw       xmm0,           xmm2        ; b1 = 0-2
    165         paddw       xmm2,           xmm2        ;
    166 
    167         movdqa      xmm5,           xmm1
    168         paddw       xmm2,           xmm0        ; a1 = 0+2
    169 
    170         pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
    171         lea         rcx,            [rdx + rdx*2]   ;dst_stride * 3
    172         paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
    173 
    174         movdqa      xmm7,           xmm3
    175         pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
    176 
    177         paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
    178         psubw       xmm7,           xmm5        ; c1
    179 
    180         movdqa      xmm5,           xmm1
    181         movdqa      xmm4,           xmm3
    182 
    183         pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
    184         paddw       xmm5,           xmm1
    185 
    186         pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
    187         paddw       xmm3,           xmm4
    188 
    189         paddw       xmm3,           xmm5        ; d1
    190         movdqa      xmm6,           xmm2        ; a1
    191 
    192         movdqa      xmm4,           xmm0        ; b1
    193         paddw       xmm2,           xmm3        ;0
    194 
    195         paddw       xmm4,           xmm7        ;1
    196         psubw       xmm0,           xmm7        ;2
    197 
    198         psubw       xmm6,           xmm3        ;3
    199 
    200     ; transpose for the second pass
    201         movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
    202         punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
    203         punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
    204 
    205         movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
    206         punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
    207         punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
    208 
    209 
    210         movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
    211         punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
    212         punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
    213 
    214         movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
    215         punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
    216         punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
    217 
    218 
    219         movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
    220         punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
    221         punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
    222 
    223         movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
    224         punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
    225         punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
    226 
    227         pshufd      xmm0,           xmm2,       11011000b
    228         pshufd      xmm2,           xmm1,       11011000b
    229 
    230         pshufd      xmm1,           xmm5,       11011000b
    231         pshufd      xmm3,           xmm7,       11011000b
    232 
    233     ; second pass
    234         psubw       xmm0,           xmm2            ; b1 = 0-2
    235         paddw       xmm2,           xmm2
    236 
    237         movdqa      xmm5,           xmm1
    238         paddw       xmm2,           xmm0            ; a1 = 0+2
    239 
    240         pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
    241         paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
    242 
    243         movdqa      xmm7,           xmm3
    244         pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
    245 
    246         paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
    247         psubw       xmm7,           xmm5            ; c1
    248 
    249         movdqa      xmm5,           xmm1
    250         movdqa      xmm4,           xmm3
    251 
    252         pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
    253         paddw       xmm5,           xmm1
    254 
    255         pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
    256         paddw       xmm3,           xmm4
    257 
    258         paddw       xmm3,           xmm5            ; d1
    259         paddw       xmm0,           [GLOBAL(fours)]
    260 
    261         paddw       xmm2,           [GLOBAL(fours)]
    262         movdqa      xmm6,           xmm2            ; a1
    263 
    264         movdqa      xmm4,           xmm0            ; b1
    265         paddw       xmm2,           xmm3            ;0
    266 
    267         paddw       xmm4,           xmm7            ;1
    268         psubw       xmm0,           xmm7            ;2
    269 
    270         psubw       xmm6,           xmm3            ;3
    271         psraw       xmm2,           3
    272 
    273         psraw       xmm0,           3
    274         psraw       xmm4,           3
    275 
    276         psraw       xmm6,           3
    277 
    278     ; transpose to save
    279         movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
    280         punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
    281         punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
    282 
    283         movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
    284         punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
    285         punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
    286 
    287 
    288         movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
    289         punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
    290         punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
    291 
    292         movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
    293         punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
    294         punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
    295 
    296 
    297         movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
    298         punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
    299         punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
    300 
    301         movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
    302         punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
    303         punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
    304 
    305         pshufd      xmm0,           xmm2,       11011000b
    306         pshufd      xmm2,           xmm1,       11011000b
    307 
    308         pshufd      xmm1,           xmm5,       11011000b
    309         pshufd      xmm3,           xmm7,       11011000b
    310 
    311         pxor        xmm7,           xmm7
    312 
    313     ; Load up predict blocks
    314         movq        xmm4,           [rdi]
    315         movq        xmm5,           [rdi+rdx]
    316 
    317         punpcklbw   xmm4,           xmm7
    318         punpcklbw   xmm5,           xmm7
    319 
    320         paddw       xmm0,           xmm4
    321         paddw       xmm1,           xmm5
    322 
    323         movq        xmm4,           [rdi+2*rdx]
    324         movq        xmm5,           [rdi+rcx]
    325 
    326         punpcklbw   xmm4,           xmm7
    327         punpcklbw   xmm5,           xmm7
    328 
    329         paddw       xmm2,           xmm4
    330         paddw       xmm3,           xmm5
    331 
    332 .finish:
    333 
    334     ; pack up before storing
    335         packuswb    xmm0,           xmm7
    336         packuswb    xmm1,           xmm7
    337         packuswb    xmm2,           xmm7
    338         packuswb    xmm3,           xmm7
    339 
    340     ; store blocks back out
    341         movq        [rdi],          xmm0
    342         movq        [rdi + rdx],    xmm1
    343         movq        [rdi + rdx*2],  xmm2
    344         movq        [rdi + rcx],    xmm3
    345 
    346     ; begin epilog
    347     pop         rdi
    348     pop         rsi
    349     RESTORE_GOT
    350     RESTORE_XMM
    351     UNSHADOW_ARGS
    352     pop         rbp
    353     ret
    354 
    355 ;void vp8_idct_dequant_dc_0_2x_sse2
    356 ; (
    357 ;   short *qcoeff       - 0
    358 ;   short *dequant      - 1
    359 ;   unsigned char *dst  - 2
    360 ;   int dst_stride      - 3
    361 ;   short *dc           - 4
    362 ; )
    363 global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
    364 sym(vp8_idct_dequant_dc_0_2x_sse2):
    365     push        rbp
    366     mov         rbp, rsp
    367     SHADOW_ARGS_TO_STACK 5
    368     GET_GOT     rbx
    369     push        rdi
    370     ; end prolog
    371 
    372     ; special case when 2 blocks have 0 or 1 coeffs
    373     ; dc is set as first coeff, so no need to load qcoeff
    374         mov         rax,            arg(0) ; qcoeff
    375 
    376         mov         rdi,            arg(2) ; dst
    377         mov         rdx,            arg(4) ; dc
    378 
    379     ; Zero out xmm5, for use unpacking
    380         pxor        xmm5,           xmm5
    381 
    382     ; load up 2 dc words here == 2*16 = doubleword
    383         movd        xmm4,           [rdx]
    384 
    385         movsxd      rdx,            dword ptr arg(3) ; dst_stride
    386         lea         rcx, [rdx + rdx*2]
    387     ; Load up predict blocks
    388         movq        xmm0,           [rdi]
    389         movq        xmm1,           [rdi+rdx*1]
    390         movq        xmm2,           [rdi+rdx*2]
    391         movq        xmm3,           [rdi+rcx]
    392 
    393     ; Duplicate and expand dc across
    394         punpcklwd   xmm4,           xmm4
    395         punpckldq   xmm4,           xmm4
    396 
    397     ; Rounding to dequant and downshift
    398         paddw       xmm4,           [GLOBAL(fours)]
    399         psraw       xmm4,           3
    400 
    401     ; Predict buffer needs to be expanded from bytes to words
    402         punpcklbw   xmm0,           xmm5
    403         punpcklbw   xmm1,           xmm5
    404         punpcklbw   xmm2,           xmm5
    405         punpcklbw   xmm3,           xmm5
    406 
    407     ; Add to predict buffer
    408         paddw       xmm0,           xmm4
    409         paddw       xmm1,           xmm4
    410         paddw       xmm2,           xmm4
    411         paddw       xmm3,           xmm4
    412 
    413     ; pack up before storing
    414         packuswb    xmm0,           xmm5
    415         packuswb    xmm1,           xmm5
    416         packuswb    xmm2,           xmm5
    417         packuswb    xmm3,           xmm5
    418 
    419     ; store blocks back out
    420         movq        [rdi],          xmm0
    421         movq        [rdi + rdx],    xmm1
    422         movq        [rdi + rdx*2],  xmm2
    423         movq        [rdi + rcx],    xmm3
    424 
    425     ; begin epilog
    426     pop         rdi
    427     RESTORE_GOT
    428     UNSHADOW_ARGS
    429     pop         rbp
    430     ret
    431 ;void vp8_idct_dequant_dc_full_2x_sse2
    432 ; (
    433 ;   short *qcoeff       - 0
    434 ;   short *dequant      - 1
    435 ;   unsigned char *dst  - 2
    436 ;   int dst_stride      - 3
    437 ;   short *dc           - 4
    438 ; )
    439 global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
    440 sym(vp8_idct_dequant_dc_full_2x_sse2):
    441     push        rbp
    442     mov         rbp, rsp
    443     SHADOW_ARGS_TO_STACK 5
    444     SAVE_XMM 7
    445     GET_GOT     rbx
    446     push        rdi
    447     ; end prolog
    448 
    449     ; special case when 2 blocks have 0 or 1 coeffs
    450     ; dc is set as first coeff, so no need to load qcoeff
    451         mov         rax,            arg(0) ; qcoeff
    452         mov         rdx,            arg(1)  ; dequant
    453 
    454         mov         rdi,            arg(2) ; dst
    455 
    456     ; Zero out xmm7, for use unpacking
    457         pxor        xmm7,           xmm7
    458 
    459 
    460     ; note the transpose of xmm1 and xmm2, necessary for shuffle
    461     ;   to spit out sensicle data
    462         movdqa      xmm0,           [rax]
    463         movdqa      xmm2,           [rax+16]
    464         movdqa      xmm1,           [rax+32]
    465         movdqa      xmm3,           [rax+48]
    466 
    467     ; Clear out coeffs
    468         movdqa      [rax],          xmm7
    469         movdqa      [rax+16],       xmm7
    470         movdqa      [rax+32],       xmm7
    471         movdqa      [rax+48],       xmm7
    472 
    473     ; dequantize qcoeff buffer
    474         pmullw      xmm0,           [rdx]
    475         pmullw      xmm2,           [rdx+16]
    476         pmullw      xmm1,           [rdx]
    477         pmullw      xmm3,           [rdx+16]
    478 
    479     ; DC component
    480         mov         rdx,            arg(4)
    481 
    482     ; repack so block 0 row x and block 1 row x are together
    483         movdqa      xmm4,           xmm0
    484         punpckldq   xmm0,           xmm1
    485         punpckhdq   xmm4,           xmm1
    486 
    487         pshufd      xmm0,           xmm0,       11011000b
    488         pshufd      xmm1,           xmm4,       11011000b
    489 
    490         movdqa      xmm4,           xmm2
    491         punpckldq   xmm2,           xmm3
    492         punpckhdq   xmm4,           xmm3
    493 
    494         pshufd      xmm2,           xmm2,       11011000b
    495         pshufd      xmm3,           xmm4,       11011000b
    496 
    497     ; insert DC component
    498         pinsrw      xmm0,           [rdx],      0
    499         pinsrw      xmm0,           [rdx+2],    4
    500 
    501     ; first pass
    502         psubw       xmm0,           xmm2        ; b1 = 0-2
    503         paddw       xmm2,           xmm2        ;
    504 
    505         movdqa      xmm5,           xmm1
    506         paddw       xmm2,           xmm0        ; a1 = 0+2
    507 
    508         pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
    509         paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
    510 
    511         movdqa      xmm7,           xmm3
    512         pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
    513 
    514         paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
    515         psubw       xmm7,           xmm5        ; c1
    516 
    517         movdqa      xmm5,           xmm1
    518         movdqa      xmm4,           xmm3
    519 
    520         pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
    521         paddw       xmm5,           xmm1
    522 
    523         pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
    524         paddw       xmm3,           xmm4
    525 
    526         paddw       xmm3,           xmm5        ; d1
    527         movdqa      xmm6,           xmm2        ; a1
    528 
    529         movdqa      xmm4,           xmm0        ; b1
    530         paddw       xmm2,           xmm3        ;0
    531 
    532         paddw       xmm4,           xmm7        ;1
    533         psubw       xmm0,           xmm7        ;2
    534 
    535         psubw       xmm6,           xmm3        ;3
    536 
    537     ; transpose for the second pass
    538         movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
    539         punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
    540         punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
    541 
    542         movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
    543         punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
    544         punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
    545 
    546 
    547         movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
    548         punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
    549         punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
    550 
    551         movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
    552         punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
    553         punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
    554 
    555 
    556         movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
    557         punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
    558         punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
    559 
    560         movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
    561         punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
    562         punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
    563 
    564         pshufd      xmm0,           xmm2,       11011000b
    565         pshufd      xmm2,           xmm1,       11011000b
    566 
    567         pshufd      xmm1,           xmm5,       11011000b
    568         pshufd      xmm3,           xmm7,       11011000b
    569 
    570     ; second pass
    571         psubw       xmm0,           xmm2            ; b1 = 0-2
    572         paddw       xmm2,           xmm2
    573 
    574         movdqa      xmm5,           xmm1
    575         paddw       xmm2,           xmm0            ; a1 = 0+2
    576 
    577         pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
    578         paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
    579 
    580         movdqa      xmm7,           xmm3
    581         pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
    582 
    583         paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
    584         psubw       xmm7,           xmm5            ; c1
    585 
    586         movdqa      xmm5,           xmm1
    587         movdqa      xmm4,           xmm3
    588 
    589         pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
    590         paddw       xmm5,           xmm1
    591 
    592         pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
    593         paddw       xmm3,           xmm4
    594 
    595         paddw       xmm3,           xmm5            ; d1
    596         paddw       xmm0,           [GLOBAL(fours)]
    597 
    598         paddw       xmm2,           [GLOBAL(fours)]
    599         movdqa      xmm6,           xmm2            ; a1
    600 
    601         movdqa      xmm4,           xmm0            ; b1
    602         paddw       xmm2,           xmm3            ;0
    603 
    604         paddw       xmm4,           xmm7            ;1
    605         psubw       xmm0,           xmm7            ;2
    606 
    607         psubw       xmm6,           xmm3            ;3
    608         psraw       xmm2,           3
    609 
    610         psraw       xmm0,           3
    611         psraw       xmm4,           3
    612 
    613         psraw       xmm6,           3
    614 
    615     ; transpose to save
    616         movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
    617         punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
    618         punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
    619 
    620         movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
    621         punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
    622         punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
    623 
    624 
    625         movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
    626         punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
    627         punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
    628 
    629         movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
    630         punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
    631         punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
    632 
    633 
    634         movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
    635         punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
    636         punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
    637 
    638         movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
    639         punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
    640         punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
    641 
    642         pshufd      xmm0,           xmm2,       11011000b
    643         pshufd      xmm2,           xmm1,       11011000b
    644 
    645         pshufd      xmm1,           xmm5,       11011000b
    646         pshufd      xmm3,           xmm7,       11011000b
    647 
    648         pxor        xmm7,           xmm7
    649 
    650     ; Load up predict blocks
    651         movsxd      rdx,            dword ptr arg(3) ; dst_stride
    652         movq        xmm4,           [rdi]
    653         movq        xmm5,           [rdi+rdx]
    654         lea         rcx,            [rdx + rdx*2]
    655 
    656         punpcklbw   xmm4,           xmm7
    657         punpcklbw   xmm5,           xmm7
    658 
    659         paddw       xmm0,           xmm4
    660         paddw       xmm1,           xmm5
    661 
    662         movq        xmm4,           [rdi+rdx*2]
    663         movq        xmm5,           [rdi+rcx]
    664 
    665         punpcklbw   xmm4,           xmm7
    666         punpcklbw   xmm5,           xmm7
    667 
    668         paddw       xmm2,           xmm4
    669         paddw       xmm3,           xmm5
    670 
    671 .finish:
    672 
    673     ; pack up before storing
    674         packuswb    xmm0,           xmm7
    675         packuswb    xmm1,           xmm7
    676         packuswb    xmm2,           xmm7
    677         packuswb    xmm3,           xmm7
    678 
    679     ; Load destination stride before writing out,
    680     ;   doesn't need to persist
    681         movsxd      rdx,            dword ptr arg(3) ; dst_stride
    682 
    683     ; store blocks back out
    684         movq        [rdi],          xmm0
    685         movq        [rdi + rdx],    xmm1
    686 
    687         lea         rdi,            [rdi + 2*rdx]
    688 
    689         movq        [rdi],          xmm2
    690         movq        [rdi + rdx],    xmm3
    691 
    692 
    693     ; begin epilog
    694     pop         rdi
    695     RESTORE_GOT
    696     RESTORE_XMM
    697     UNSHADOW_ARGS
    698     pop         rbp
    699     ret
    700 
    701 SECTION_RODATA
    702 align 16
    703 fours:
    704     times 8 dw 0x0004
    705 align 16
    706 x_s1sqr2:
    707     times 8 dw 0x8A8C
    708 align 16
    709 x_c1sqr2less1:
    710     times 8 dw 0x4E7B
    711