;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

%macro LF_ABS 2
        ; %1 value not preserved
        ; %2 value preserved
        ; output in %1
        movdqa      scratch1, %2            ; v2

        psubusb     scratch1, %1            ; v2 - v1
        psubusb     %1, %2                  ; v1 - v2
        por         %1, scratch1            ; abs(v2 - v1)
%endmacro
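
; Note (added commentary, not from the original source): SSE2 has no per-byte
; absolute-difference instruction, so LF_ABS builds abs(v1 - v2) from two
; unsigned saturating subtractions.  A scalar sketch of the trick, with
; illustrative names:
;
;     uint8_t abs_diff(uint8_t v1, uint8_t v2) {
;         uint8_t a = (v2 > v1) ? (uint8_t)(v2 - v1) : 0;  /* psubusb */
;         uint8_t b = (v1 > v2) ? (uint8_t)(v1 - v2) : 0;  /* psubusb */
;         return a | b;     /* exactly one term is nonzero: |v1 - v2| */
;     }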

%macro LF_FILTER_HEV_MASK 8-9

        LF_ABS      %1, %2                  ; abs(p3 - p2)
        LF_ABS      %2, %3                  ; abs(p2 - p1)
        pmaxub      %1, %2                  ; accumulate mask
%if %0 == 8
        movdqa      scratch2, %3            ; save p1
        LF_ABS      scratch2, %4            ; abs(p1 - p0)
%endif
        LF_ABS      %4, %5                  ; abs(p0 - q0)
        LF_ABS      %5, %6                  ; abs(q0 - q1)
%if %0 == 8
        pmaxub      %5, scratch2            ; accumulate hev
%else
        pmaxub      %5, %9
%endif
        pmaxub      %1, %5                  ; accumulate mask

        LF_ABS      %3, %6                  ; abs(p1 - q1)
        LF_ABS      %6, %7                  ; abs(q1 - q2)
        pmaxub      %1, %6                  ; accumulate mask
        LF_ABS      %7, %8                  ; abs(q2 - q3)
        pmaxub      %1, %7                  ; accumulate mask

        paddusb     %4, %4                  ; 2 * abs(p0 - q0)
        pand        %3, [GLOBAL(tfe)]
        psrlw       %3, 1                   ; abs(p1 - q1) / 2
        paddusb     %4, %3                  ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2

        psubusb     %1, [limit]
        psubusb     %4, [blimit]
        por         %1, %4
        pcmpeqb     %1, zero                ; mask

        psubusb     %5, [thresh]
        pcmpeqb     %5, zero                ; ~hev
%endmacro
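
; Note (added commentary): LF_FILTER_HEV_MASK vectorizes the VP8 filter-mask
; and high-edge-variance tests.  A scalar sketch of what it decides per pixel
; column (hedged paraphrase; names are illustrative):
;
;     mask    = 0xff  iff  max(|p3-p2|, |p2-p1|, |p1-p0|,
;                              |q1-q0|, |q2-q1|, |q3-q2|) <= limit
;                     and  |p0-q0| * 2 + |p1-q1| / 2       <= blimit
;     not_hev = 0xff  iff  max(|p1-p0|, |q1-q0|)           <= thresh
;
; psubusb against [limit]/[blimit]/[thresh] leaves zero exactly when a value
; is within its bound, and pcmpeqb with zero turns that into an all-ones byte
; mask.  The pand with tfe (0xfe) before psrlw 1 clears bit 0 of every byte so
; the word-wide shift behaves like an independent per-byte shift.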

%macro LF_FILTER 6
        ; %1-%4: p1, p0, q0, q1
        ; %5: mask
        ; %6: ~hev

        movdqa      scratch2, %6            ; save ~hev

        pxor        %1, [GLOBAL(t80)]       ; ps1
        pxor        %4, [GLOBAL(t80)]       ; qs1
        movdqa      scratch1, %1
        psubsb      scratch1, %4            ; signed_char_clamp(ps1 - qs1)
        pandn       scratch2, scratch1      ; vp8_filter &= hev

        pxor        %2, [GLOBAL(t80)]       ; ps0
        pxor        %3, [GLOBAL(t80)]       ; qs0
        movdqa      scratch1, %3
        psubsb      scratch1, %2            ; qs0 - ps0
        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
        pand        %5, scratch2            ; &= mask

        movdqa      scratch2, %5
        paddsb      %5, [GLOBAL(t4)]        ; Filter1
        paddsb      scratch2, [GLOBAL(t3)]  ; Filter2

        ; Filter1 >> 3
        movdqa      scratch1, zero
        pcmpgtb     scratch1, %5
        psrlw       %5, 3
        pand        scratch1, [GLOBAL(te0)]
        pand        %5, [GLOBAL(t1f)]
        por         %5, scratch1

        psubsb      %3, %5                  ; qs0 - Filter1
        pxor        %3, [GLOBAL(t80)]

        ; Filter2 >> 3
        movdqa      scratch1, zero
        pcmpgtb     scratch1, scratch2
        psrlw       scratch2, 3
        pand        scratch1, [GLOBAL(te0)]
        pand        scratch2, [GLOBAL(t1f)]
        por         scratch2, scratch1

        paddsb      %2, scratch2            ; ps0 + Filter2
        pxor        %2, [GLOBAL(t80)]

        ; outer tap adjustments
        paddsb      %5, [GLOBAL(t1)]
        movdqa      scratch1, zero
        pcmpgtb     scratch1, %5
        psrlw       %5, 1
        pand        scratch1, [GLOBAL(t80)]
        pand        %5, [GLOBAL(t7f)]
        por         %5, scratch1
        pand        %5, %6                  ; vp8_filter &= ~hev

        psubsb      %4, %5                  ; qs1 - vp8_filter
        pxor        %4, [GLOBAL(t80)]

        paddsb      %1, %5                  ; ps1 + vp8_filter
        pxor        %1, [GLOBAL(t80)]
%endmacro
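
; Note (added commentary): LF_FILTER applies the VP8 "normal" 4-tap loop
; filter.  The pxor with t80 (0x80) moves pixels between the unsigned [0,255]
; and signed [-128,127] domains so paddsb/psubsb provide signed-char
; saturation.  A scalar sketch of the arithmetic, with sc() standing for
; signed-char clamping (hedged paraphrase of the reference C filter):
;
;     d = sc(qs0 - ps0);
;     f = hev ? sc(ps1 - qs1) : 0;
;     f = sc(sc(sc(f + d) + d) + d) & mask;        /* three paddsb */
;     Filter1 = sc(f + 4) >> 3;    qs0 = sc(qs0 - Filter1);
;     Filter2 = sc(f + 3) >> 3;    ps0 = sc(ps0 + Filter2);
;     f = sc(Filter1 + 1) >> 1;    f &= not_hev;
;     qs1 = sc(qs1 - f);           ps1 = sc(ps1 + f);
;
; SSE2 has no arithmetic right shift for packed bytes, so ">> 3" is emulated:
; psrlw does a logical word shift, pand t1f (0x1f) keeps the five valid bits
; of each byte, and bytes that were negative (detected with pcmpgtb against
; zero) get their top bits restored by or'ing in te0 (0xe0).  The ">> 1" for
; the outer taps uses t7f/t80 in the same way.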

;void vp8_loop_filter_bh_y_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
sym(vp8_loop_filter_bh_y_sse2):

%if LIBVPX_YASM_WIN64
    %define src      rcx ; src_ptr
    %define stride   rdx ; src_pixel_step
    %define blimit   r8
    %define limit    r9
    %define thresh   r10

    %define spp      rax
    %define stride3  r11
    %define stride5  r12
    %define stride7  r13

    push    rbp
    mov     rbp, rsp
    SAVE_XMM 11
    push    r12
    push    r13
    mov     thresh, arg(4)
%else
    %define src      rdi ; src_ptr
    %define stride   rsi ; src_pixel_step
    %define blimit   rdx
    %define limit    rcx
    %define thresh   r8

    %define spp      rax
    %define stride3  r9
    %define stride5  r10
    %define stride7  r11
%endif

    %define scratch1 xmm5
    %define scratch2 xmm6
    %define zero     xmm7

    %define i0       [src]
    %define i1       [spp]
    %define i2       [src + 2 * stride]
    %define i3       [spp + 2 * stride]
    %define i4       [src + 4 * stride]
    %define i5       [spp + 4 * stride]
    %define i6       [src + 2 * stride3]
    %define i7       [spp + 2 * stride3]
    %define i8       [src + 8 * stride]
    %define i9       [spp + 8 * stride]
    %define i10      [src + 2 * stride5]
    %define i11      [spp + 2 * stride5]
    %define i12      [src + 4 * stride3]
    %define i13      [spp + 4 * stride3]
    %define i14      [src + 2 * stride7]
    %define i15      [spp + 2 * stride7]
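
; Note (added commentary): i0..i15 name the 16 rows of the block.  With
; spp = src + stride and the precomputed stride3/5/7, every row offset
; n * stride (n = 0..15) is reachable with one base register plus one scaled
; index, e.g. i7 = spp + 2 * stride3 = src + 7 * stride.  Each
; LF_FILTER_HEV_MASK / LF_FILTER pair below handles one internal horizontal
; edge of the 16-row block (summary derived from the loads and stores that
; follow):
;     set 1: reads rows 0-7,  filters between rows 3 and 4,   rewrites rows 2-5
;     set 2: reads rows 4-11, filters between rows 7 and 8,   rewrites rows 6-9
;     set 3: reads rows 8-15, filters between rows 11 and 12, rewrites rows 10-13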

    ; prep work
    lea         spp, [src + stride]
    lea         stride3, [stride + 2 * stride]
    lea         stride5, [stride3 + 2 * stride]
    lea         stride7, [stride3 + 4 * stride]
    pxor        zero, zero

        ; load the first set into registers
        movdqa       xmm0, i0
        movdqa       xmm1, i1
        movdqa       xmm2, i2
        movdqa       xmm3, i3
        movdqa       xmm4, i4
        movdqa       xmm8, i5
        movdqa       xmm9, i6   ; q2, will contain abs(p1-p0)
        movdqa       xmm10, i7
LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10

        movdqa       xmm1, i2
        movdqa       xmm2, i3
        movdqa       xmm3, i4
        movdqa       xmm8, i5
LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4
        movdqa       i2, xmm1
        movdqa       i3, xmm2

; second set
        movdqa       i4, xmm3
        movdqa       i5, xmm8

        movdqa       xmm0, i6
        movdqa       xmm1, i7
        movdqa       xmm2, i8
        movdqa       xmm4, i9
        movdqa       xmm10, i10   ; q2, will contain abs(p1-p0)
        movdqa       xmm11, i11
LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9

        movdqa       xmm0, i6
        movdqa       xmm1, i7
        movdqa       xmm4, i8
        movdqa       xmm8, i9
LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
        movdqa       i6, xmm0
        movdqa       i7, xmm1

; last set
        movdqa       i8, xmm4
        movdqa       i9, xmm8

        movdqa       xmm0, i10
        movdqa       xmm1, i11
        movdqa       xmm2, i12
        movdqa       xmm3, i13
        movdqa       xmm9, i14   ; q2, will contain abs(p1-p0)
        movdqa       xmm11, i15
LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10

        movdqa       xmm0, i10
        movdqa       xmm1, i11
        movdqa       xmm3, i12
        movdqa       xmm8, i13
LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
        movdqa       i10, xmm0
        movdqa       i11, xmm1
        movdqa       i12, xmm3
        movdqa       i13, xmm8

%if LIBVPX_YASM_WIN64
    pop    r13
    pop    r12
    RESTORE_XMM
    pop    rbp
%endif

    ret


;void vp8_loop_filter_bv_y_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
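
; Note (added commentary): the vertical-edge variant below reuses the same
; row-oriented filter.  Because the SSE2 loads work on whole rows, the 16x16
; block is first transposed into a 256-byte stack buffer (i0..i15), the three
; internal edges are filtered exactly as in vp8_loop_filter_bh_y_sse2, and the
; result is transposed back and written to the original rows (s0..s15).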

global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
sym(vp8_loop_filter_bv_y_sse2):

%if LIBVPX_YASM_WIN64
    %define src      rcx ; src_ptr
    %define stride   rdx ; src_pixel_step
    %define blimit   r8
    %define limit    r9
    %define thresh   r10

    %define spp      rax
    %define stride3  r11
    %define stride5  r12
    %define stride7  r13

    push    rbp
    mov     rbp, rsp
    SAVE_XMM 15
    push    r12
    push    r13
    mov     thresh, arg(4)
%else
    %define src      rdi
    %define stride   rsi
    %define blimit   rdx
    %define limit    rcx
    %define thresh   r8

    %define spp      rax
    %define stride3  r9
    %define stride5  r10
    %define stride7  r11
%endif

    %define scratch1 xmm5
    %define scratch2 xmm6
    %define zero     xmm7

    %define s0       [src]
    %define s1       [spp]
    %define s2       [src + 2 * stride]
    %define s3       [spp + 2 * stride]
    %define s4       [src + 4 * stride]
    %define s5       [spp + 4 * stride]
    %define s6       [src + 2 * stride3]
    %define s7       [spp + 2 * stride3]
    %define s8       [src + 8 * stride]
    %define s9       [spp + 8 * stride]
    %define s10      [src + 2 * stride5]
    %define s11      [spp + 2 * stride5]
    %define s12      [src + 4 * stride3]
    %define s13      [spp + 4 * stride3]
    %define s14      [src + 2 * stride7]
    %define s15      [spp + 2 * stride7]

    %define i0       [rsp]
    %define i1       [rsp + 16]
    %define i2       [rsp + 32]
    %define i3       [rsp + 48]
    %define i4       [rsp + 64]
    %define i5       [rsp + 80]
    %define i6       [rsp + 96]
    %define i7       [rsp + 112]
    %define i8       [rsp + 128]
    %define i9       [rsp + 144]
    %define i10      [rsp + 160]
    %define i11      [rsp + 176]
    %define i12      [rsp + 192]
    %define i13      [rsp + 208]
    %define i14      [rsp + 224]
    %define i15      [rsp + 240]
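
; Note (added commentary): s0..s15 address the 16 source rows in place, while
; i0..i15 address consecutive 16-byte slots of the 256-byte scratch area
; reserved on the stack below.  ALIGN_STACK aligns rsp to 16 bytes, and the
; 256-byte reservation keeps that alignment, so the aligned movdqa accesses to
; i0..i15 are legal.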

    ALIGN_STACK 16, rax

    ; reserve stack space
    %define      temp_storage  0 ; size is 256 (16*16)
    %define      stack_size 256
    sub          rsp, stack_size

    ; prep work
    lea         spp, [src + stride]
    lea         stride3, [stride + 2 * stride]
    lea         stride5, [stride3 + 2 * stride]
    lea         stride7, [stride3 + 4 * stride]
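
; Note (added commentary): what follows is a standard SSE2 16x16 byte
; transpose built as an interleave ladder, punpck{l,h}bw -> punpck{l,h}wd ->
; punpck{l,h}dq -> punpck{l,h}qdq.  Each stage doubles the number of source
; rows interleaved into every register, so after four stages each 16-byte
; store to i0..i15 holds one column of the original block.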

        ; 8-f
        movdqa      xmm0, s8
        movdqa      xmm1, xmm0
        punpcklbw   xmm0, s9                ; 80 90
        punpckhbw   xmm1, s9                ; 88 98

        movdqa      xmm2, s10
        movdqa      xmm3, xmm2
        punpcklbw   xmm2, s11 ; a0 b0
        punpckhbw   xmm3, s11 ; a8 b8

        movdqa      xmm4, xmm0
        punpcklwd   xmm0, xmm2              ; 80 90 a0 b0
        punpckhwd   xmm4, xmm2              ; 84 94 a4 b4

        movdqa      xmm2, xmm1
        punpcklwd   xmm1, xmm3              ; 88 98 a8 b8
        punpckhwd   xmm2, xmm3              ; 8c 9c ac bc

        ; using xmm[0124]
        ; work on next 4 rows

        movdqa      xmm3, s12
        movdqa      xmm5, xmm3
        punpcklbw   xmm3, s13 ; c0 d0
        punpckhbw   xmm5, s13 ; c8 d8

        movdqa      xmm6, s14
        movdqa      xmm7, xmm6
        punpcklbw   xmm6, s15 ; e0 f0
        punpckhbw   xmm7, s15 ; e8 f8

        movdqa      xmm8, xmm3
        punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0
        punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4

        movdqa      xmm6, xmm5
        punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8
        punpckhwd   xmm6, xmm7              ; cc dc ec fc

        ; pull the third and fourth sets together

        movdqa      xmm7, xmm0
        punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0
        punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2

        movdqa      xmm3, xmm4
        punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4
        punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6

        movdqa      xmm8, xmm1
        punpckldq   xmm1, xmm5              ; 88 98 a8 b8 c8 d8 e8 f8
        punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa

        movdqa      xmm5, xmm2
        punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc
        punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe

        ; save the calculations. we only have 15 registers ...
        movdqa      i0, xmm0
        movdqa      i1, xmm7
        movdqa      i2, xmm4
        movdqa      i3, xmm3
        movdqa      i4, xmm1
        movdqa      i5, xmm8
        movdqa      i6, xmm2
        movdqa      i7, xmm5

        ; 0-7
        movdqa      xmm0, s0
        movdqa      xmm1, xmm0
        punpcklbw   xmm0, s1 ; 00 10
        punpckhbw   xmm1, s1 ; 08 18

        movdqa      xmm2, s2
        movdqa      xmm3, xmm2
        punpcklbw   xmm2, s3 ; 20 30
        punpckhbw   xmm3, s3 ; 28 38

        movdqa      xmm4, xmm0
        punpcklwd   xmm0, xmm2              ; 00 10 20 30
        punpckhwd   xmm4, xmm2              ; 04 14 24 34

        movdqa      xmm2, xmm1
        punpcklwd   xmm1, xmm3              ; 08 18 28 38
        punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c

        ; using xmm[0124]
        ; work on next 4 rows

        movdqa      xmm3, s4
        movdqa      xmm5, xmm3
        punpcklbw   xmm3, s5 ; 40 50
        punpckhbw   xmm5, s5 ; 48 58

        movdqa      xmm6, s6
        movdqa      xmm7, xmm6
        punpcklbw   xmm6, s7   ; 60 70
        punpckhbw   xmm7, s7   ; 68 78

        movdqa      xmm8, xmm3
        punpcklwd   xmm3, xmm6              ; 40 50 60 70
        punpckhwd   xmm8, xmm6              ; 44 54 64 74

        movdqa      xmm6, xmm5
        punpcklwd   xmm5, xmm7              ; 48 58 68 78
        punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c

        ; pull the first two sets together

        movdqa      xmm7, xmm0
        punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70
        punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72

        movdqa      xmm3, xmm4
        punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74
        punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76

        movdqa      xmm8, xmm1
        punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78
        punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a

        movdqa      xmm5, xmm2
        punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c
        punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e
        ; final combination

        movdqa      xmm6, xmm0
        punpcklqdq  xmm0, i0
        punpckhqdq  xmm6, i0

        movdqa      xmm9, xmm7
        punpcklqdq  xmm7, i1
        punpckhqdq  xmm9, i1

        movdqa      xmm10, xmm4
        punpcklqdq  xmm4, i2
        punpckhqdq  xmm10, i2

        movdqa      xmm11, xmm3
        punpcklqdq  xmm3, i3
        punpckhqdq  xmm11, i3

        movdqa      xmm12, xmm1
        punpcklqdq  xmm1, i4
        punpckhqdq  xmm12, i4

        movdqa      xmm13, xmm8
        punpcklqdq  xmm8, i5
        punpckhqdq  xmm13, i5

        movdqa      xmm14, xmm2
        punpcklqdq  xmm2, i6
        punpckhqdq  xmm14, i6

        movdqa      xmm15, xmm5
        punpcklqdq  xmm5, i7
        punpckhqdq  xmm15, i7

        movdqa      i0, xmm0
        movdqa      i1, xmm6
        movdqa      i2, xmm7
        movdqa      i3, xmm9
        movdqa      i4, xmm4
        movdqa      i5, xmm10
        movdqa      i6, xmm3
        movdqa      i7, xmm11
        movdqa      i8, xmm1
        movdqa      i9, xmm12
        movdqa      i10, xmm8
        movdqa      i11, xmm13
        movdqa      i12, xmm2
        movdqa      i13, xmm14
        movdqa      i14, xmm5
        movdqa      i15, xmm15

; TRANSPOSED DATA AVAILABLE ON THE STACK

        movdqa      xmm12, xmm6
        movdqa      xmm13, xmm7

        pxor        zero, zero

LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11

        movdqa       xmm1, i2
        movdqa       xmm2, i3
        movdqa       xmm8, i4
        movdqa       xmm9, i5
LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4
        movdqa       i2, xmm1
        movdqa       i3, xmm2

; second set
        movdqa       i4, xmm8
        movdqa       i5, xmm9

        movdqa       xmm0, i6
        movdqa       xmm1, i7
        movdqa       xmm2, i8
        movdqa       xmm4, i9
        movdqa       xmm10, i10   ; q2, will contain abs(p1-p0)
        movdqa       xmm11, i11
LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3

        movdqa       xmm0, i6
        movdqa       xmm1, i7
        movdqa       xmm3, i8
        movdqa       xmm4, i9
LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2
        movdqa       i6, xmm0
        movdqa       i7, xmm1

; last set
        movdqa       i8, xmm3
        movdqa       i9, xmm4

        movdqa       xmm0, i10
        movdqa       xmm1, i11
        movdqa       xmm2, i12
        movdqa       xmm8, i13
        movdqa       xmm9, i14   ; q2, will contain abs(p1-p0)
        movdqa       xmm11, i15
LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10

        movdqa       xmm0, i10
        movdqa       xmm1, i11
        movdqa       xmm4, i12
        movdqa       xmm8, i13
LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
        movdqa       i10, xmm0
        movdqa       i11, xmm1
        movdqa       i12, xmm4
        movdqa       i13, xmm8


; RESHUFFLE AND WRITE OUT
        ; 8-f
        movdqa      xmm0, i8
        movdqa      xmm1, xmm0
        punpcklbw   xmm0, i9                ; 80 90
        punpckhbw   xmm1, i9                ; 88 98

        movdqa      xmm2, i10
        movdqa      xmm3, xmm2
        punpcklbw   xmm2, i11               ; a0 b0
        punpckhbw   xmm3, i11               ; a8 b8

        movdqa      xmm4, xmm0
        punpcklwd   xmm0, xmm2              ; 80 90 a0 b0
        punpckhwd   xmm4, xmm2              ; 84 94 a4 b4

        movdqa      xmm2, xmm1
        punpcklwd   xmm1, xmm3              ; 88 98 a8 b8
        punpckhwd   xmm2, xmm3              ; 8c 9c ac bc

        ; using xmm[0124]
        ; work on next 4 rows

        movdqa      xmm3, i12
        movdqa      xmm5, xmm3
        punpcklbw   xmm3, i13               ; c0 d0
        punpckhbw   xmm5, i13               ; c8 d8

        movdqa      xmm6, i14
        movdqa      xmm7, xmm6
        punpcklbw   xmm6, i15               ; e0 f0
        punpckhbw   xmm7, i15               ; e8 f8

        movdqa      xmm8, xmm3
        punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0
        punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4

        movdqa      xmm6, xmm5
        punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8
        punpckhwd   xmm6, xmm7              ; cc dc ec fc

        ; pull the third and fourth sets together

        movdqa      xmm7, xmm0
        punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0
        punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2

        movdqa      xmm3, xmm4
        punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4
        punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6

        movdqa      xmm8, xmm1
        punpckldq   xmm1, xmm5              ; 88 98 a8 b8 c8 d8 e8 f8
        punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa

        movdqa      xmm5, xmm2
        punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc
        punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe

        ; save the calculations. we only have 15 registers ...
        movdqa      i8, xmm0
        movdqa      i9, xmm7
        movdqa      i10, xmm4
        movdqa      i11, xmm3
        movdqa      i12, xmm1
        movdqa      i13, xmm8
        movdqa      i14, xmm2
        movdqa      i15, xmm5

        ; 0-7
        movdqa      xmm0, i0
        movdqa      xmm1, xmm0
        punpcklbw   xmm0, i1                ; 00 10
        punpckhbw   xmm1, i1                ; 08 18

        movdqa      xmm2, i2
        movdqa      xmm3, xmm2
        punpcklbw   xmm2, i3                ; 20 30
        punpckhbw   xmm3, i3                ; 28 38

        movdqa      xmm4, xmm0
        punpcklwd   xmm0, xmm2              ; 00 10 20 30
        punpckhwd   xmm4, xmm2              ; 04 14 24 34

        movdqa      xmm2, xmm1
        punpcklwd   xmm1, xmm3              ; 08 18 28 38
        punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c

        ; using xmm[0124]
        ; work on next 4 rows

        movdqa      xmm3, i4
        movdqa      xmm5, xmm3
        punpcklbw   xmm3, i5                ; 40 50
        punpckhbw   xmm5, i5                ; 48 58

        movdqa      xmm6, i6
        movdqa      xmm7, xmm6
        punpcklbw   xmm6, i7                ; 60 70
        punpckhbw   xmm7, i7                ; 68 78

        movdqa      xmm8, xmm3
        punpcklwd   xmm3, xmm6              ; 40 50 60 70
        punpckhwd   xmm8, xmm6              ; 44 54 64 74

        movdqa      xmm6, xmm5
        punpcklwd   xmm5, xmm7              ; 48 58 68 78
        punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c

        ; pull the first two sets together

        movdqa      xmm7, xmm0
        punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70
        punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72

        movdqa      xmm3, xmm4
        punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74
        punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76

        movdqa      xmm8, xmm1
        punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78
        punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a

        movdqa      xmm5, xmm2
        punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c
        punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e
        ; final combination

        movdqa      xmm6, xmm0
        punpcklqdq  xmm0, i8
        punpckhqdq  xmm6, i8

        movdqa      xmm9, xmm7
        punpcklqdq  xmm7, i9
        punpckhqdq  xmm9, i9

        movdqa      xmm10, xmm4
        punpcklqdq  xmm4, i10
        punpckhqdq  xmm10, i10

        movdqa      xmm11, xmm3
        punpcklqdq  xmm3, i11
        punpckhqdq  xmm11, i11

        movdqa      xmm12, xmm1
        punpcklqdq  xmm1, i12
        punpckhqdq  xmm12, i12

        movdqa      xmm13, xmm8
        punpcklqdq  xmm8, i13
        punpckhqdq  xmm13, i13

        movdqa      xmm14, xmm2
        punpcklqdq  xmm2, i14
        punpckhqdq  xmm14, i14

        movdqa      xmm15, xmm5
        punpcklqdq  xmm5, i15
        punpckhqdq  xmm15, i15

        movdqa      s0, xmm0
        movdqa      s1, xmm6
        movdqa      s2, xmm7
        movdqa      s3, xmm9
        movdqa      s4, xmm4
        movdqa      s5, xmm10
        movdqa      s6, xmm3
        movdqa      s7, xmm11
        movdqa      s8, xmm1
        movdqa      s9, xmm12
        movdqa      s10, xmm8
        movdqa      s11, xmm13
        movdqa      s12, xmm2
        movdqa      s13, xmm14
        movdqa      s14, xmm5
        movdqa      s15, xmm15

    ; free stack space
    add          rsp, stack_size

    ; un-ALIGN_STACK
    pop          rsp

%if LIBVPX_YASM_WIN64
    pop    r13
    pop    r12
    RESTORE_XMM
    pop    rbp
%endif

    ret

SECTION_RODATA
align 16
te0:
    times 16 db 0xe0
align 16
t7f:
    times 16 db 0x7f
align 16
tfe:
    times 16 db 0xfe
align 16
t1f:
    times 16 db 0x1f
align 16
t80:
    times 16 db 0x80
align 16
t1:
    times 16 db 0x01
align 16
t3:
    times 16 db 0x03
align 16
t4:
    times 16 db 0x04
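
; Note (added commentary): te0/t1f and t80/t7f are the mask pairs used above
; to emulate per-byte arithmetic right shifts by 3 and by 1 (psrlw plus sign
; fix-up); tfe clears bit 0 of each byte so psrlw can halve unsigned bytes;
; t80 is also the bias that moves pixels between the unsigned and signed
; domains; t1, t3 and t4 are the rounding constants of the normal filter.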
    816