;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
%define _t0 0
%define _t1 _t0 + 16
%define _p3 _t1 + 16
%define _p2 _p3 + 16
%define _p1 _p2 + 16
%define _p0 _p1 + 16
%define _q0 _p0 + 16
%define _q1 _q0 + 16
%define _q2 _q1 + 16
%define _q3 _q2 + 16
%define lf_var_size 160
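; Scratch layout on the 16-byte-aligned stack area: two temporaries
; (_t0/_t1) followed by eight rows p3 .. q3, 10 * 16 = 160 bytes in all.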

; Use of pmaxub instead of psubusb to compute filter mask was seen
; in ffvp8

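; LFH_FILTER_AND_HEV_MASK %1
;   %1 == 1: horizontal Y edge; the 16 pixels of each row are read
;            directly through rsi/rdi with the pitch in rax (rax is
;            negated partway through to reach the rows above the edge).
;   %1 == 0: horizontal U/V edge; eight pixels per plane are gathered
;            with movlps/movhps and some rows are spilled to the stack.
; On exit xmm1 holds the filter mask (0xFF where the edge should be
; filtered) and xmm4 the high-edge-variance (hev) mask. Per byte,
; roughly (an illustrative C sketch, not the exact instruction flow):
;   mask = (abs(p3-p2) <= limit && abs(p2-p1) <= limit && ...
;           && abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit) ? 0xFF : 0;
;   hev  = (abs(p1-p0) > thresh || abs(q1-q0) > thresh) ? 0xFF : 0;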
%macro LFH_FILTER_AND_HEV_MASK 1
%if %1
        movdqa      xmm2,                   [rdi+2*rax]       ; q3
        movdqa      xmm1,                   [rsi+2*rax]       ; q2
        movdqa      xmm4,                   [rsi+rax]         ; q1
        movdqa      xmm5,                   [rsi]             ; q0
        neg         rax                     ; negate pitch to deal with above border
%else
        movlps      xmm2,                   [rsi + rcx*2]     ; q3
        movlps      xmm1,                   [rsi + rcx]       ; q2
        movlps      xmm4,                   [rsi]             ; q1
        movlps      xmm5,                   [rsi + rax]       ; q0

        movhps      xmm2,                   [rdi + rcx*2]
        movhps      xmm1,                   [rdi + rcx]
        movhps      xmm4,                   [rdi]
        movhps      xmm5,                   [rdi + rax]

        lea         rsi,                    [rsi + rax*4]
        lea         rdi,                    [rdi + rax*4]

        movdqa      [rsp+_q2],              xmm1              ; store q2
        movdqa      [rsp+_q1],              xmm4              ; store q1
%endif
        movdqa      xmm7,                   [rdx]             ;limit

        movdqa      xmm6,                   xmm1              ; q2
        movdqa      xmm3,                   xmm4              ; q1

        psubusb     xmm1,                   xmm2              ; q2-=q3
        psubusb     xmm2,                   xmm6              ; q3-=q2

        psubusb     xmm4,                   xmm6              ; q1-=q2
        psubusb     xmm6,                   xmm3              ; q2-=q1

        por         xmm4,                   xmm6              ; abs(q2-q1)
        por         xmm1,                   xmm2              ; abs(q3-q2)

        movdqa      xmm0,                   xmm5              ; q0
        pmaxub      xmm1,                   xmm4

        psubusb     xmm5,                   xmm3              ; q0-=q1
        psubusb     xmm3,                   xmm0              ; q1-=q0

        por         xmm5,                   xmm3              ; abs(q0-q1)
        movdqa      [rsp+_t0],              xmm5              ; save to t0

        pmaxub      xmm1,                   xmm5

%if %1
        movdqa      xmm2,                   [rsi+4*rax]       ; p3
        movdqa      xmm4,                   [rdi+4*rax]       ; p2
        movdqa      xmm6,                   [rsi+2*rax]       ; p1
%else
        movlps      xmm2,                   [rsi + rax]       ; p3
        movlps      xmm4,                   [rsi]             ; p2
        movlps      xmm6,                   [rsi + rcx]       ; p1

        movhps      xmm2,                   [rdi + rax]
        movhps      xmm4,                   [rdi]
        movhps      xmm6,                   [rdi + rcx]

        movdqa      [rsp+_p2],              xmm4              ; store p2
        movdqa      [rsp+_p1],              xmm6              ; store p1
%endif

        movdqa      xmm5,                   xmm4              ; p2
        movdqa      xmm3,                   xmm6              ; p1

        psubusb     xmm4,                   xmm2              ; p2-=p3
        psubusb     xmm2,                   xmm5              ; p3-=p2

        psubusb     xmm3,                   xmm5              ; p1-=p2
        pmaxub      xmm1,                   xmm4              ; abs(p3 - p2)

        psubusb     xmm5,                   xmm6              ; p2-=p1
        pmaxub      xmm1,                   xmm2              ; abs(p3 - p2)

        pmaxub      xmm1,                   xmm5              ; abs(p2 - p1)
        movdqa      xmm2,                   xmm6              ; p1

        pmaxub      xmm1,                   xmm3              ; abs(p2 - p1)
%if %1
        movdqa      xmm4,                   [rsi+rax]         ; p0
        movdqa      xmm3,                   [rdi]             ; q1
%else
        movlps      xmm4,                   [rsi + rcx*2]     ; p0
        movhps      xmm4,                   [rdi + rcx*2]
        movdqa      xmm3,                   [rsp+_q1]         ; q1
%endif

        movdqa      xmm5,                   xmm4              ; p0
        psubusb     xmm4,                   xmm6              ; p0-=p1

        psubusb     xmm6,                   xmm5              ; p1-=p0

        por         xmm6,                   xmm4              ; abs(p1 - p0)
        mov         rdx,                    arg(2)            ; get blimit

        movdqa      [rsp+_t1],              xmm6              ; save to t1

        movdqa      xmm4,                   xmm3              ; q1
        pmaxub      xmm1,                   xmm6

        psubusb     xmm3,                   xmm2              ; q1-=p1
        psubusb     xmm2,                   xmm4              ; p1-=q1

        psubusb     xmm1,                   xmm7
        por         xmm2,                   xmm3              ; abs(p1-q1)

        movdqa      xmm7,                   [rdx]             ; blimit
        mov         rdx,                    arg(4)            ; hev get thresh

        movdqa      xmm3,                   xmm0              ; q0
        pand        xmm2,                   [GLOBAL(tfe)]     ; set lsb of each byte to zero

        movdqa      xmm6,                   xmm5              ; p0
        psrlw       xmm2,                   1                 ; abs(p1-q1)/2

        psubusb     xmm5,                   xmm3              ; p0-=q0
        psubusb     xmm3,                   xmm6              ; q0-=p0
        por         xmm5,                   xmm3              ; abs(p0 - q0)

        paddusb     xmm5,                   xmm5              ; abs(p0-q0)*2

        movdqa      xmm4,                   [rsp+_t0]         ; hev get abs (q1 - q0)
        movdqa      xmm3,                   [rsp+_t1]         ; get abs (p1 - p0)

        paddusb     xmm5,                   xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        movdqa      xmm2,                   [rdx]             ; hev

        psubusb     xmm5,                   xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
        psubusb     xmm4,                   xmm2              ; hev

        psubusb     xmm3,                   xmm2              ; hev
        por         xmm1,                   xmm5

        pxor        xmm7,                   xmm7
        paddb       xmm4,                   xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        pcmpeqb     xmm4,                   xmm5              ; hev (xmm5 is zero wherever the final mask passes)
        pcmpeqb     xmm3,                   xmm3              ; hev

        pcmpeqb     xmm1,                   xmm7              ; mask xmm1
        pxor        xmm4,                   xmm3              ; hev
%endmacro

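; B_FILTER %1 applies the normal (non-macroblock) filter to p1/p0/q0/q1.
;   %1 == 0: horizontal U/V edge (pixels come from the stack spill area)
;   %1 == 1: horizontal Y edge (pixels are re-read through rsi/rdi)
;   %1 == 2: vertical edge (the transposed rows live on the stack)
; The arithmetic is the usual VP8 loop filter; roughly, per byte (an
; illustrative C sketch, signed-saturating clamps elided):
;   f  = 3 * (q0 - p0) + ((p1 - q1) & hev);   f &= mask;
;   F1 = clamp(f + 4) >> 3;   q0 -= F1;
;   F2 = clamp(f + 3) >> 3;   p0 += F2;
;   u  = (F1 + 1) >> 1;       p1 += u & ~hev;   q1 -= u & ~hev;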
%macro B_FILTER 1
        movdqa      xmm3,                   [GLOBAL(t80)]
%if %1 == 0
        movdqa      xmm2,                   [rsp+_p1]         ; p1
        movdqa      xmm7,                   [rsp+_q1]         ; q1
%elif %1 == 1
        movdqa      xmm2,                   [rsi+2*rax]       ; p1
        movdqa      xmm7,                   [rdi]             ; q1
%elif %1 == 2
        movdqa      xmm2,                   [rsp+_p1]         ; p1
        movdqa      xmm6,                   [rsp+_p0]         ; p0
        movdqa      xmm0,                   [rsp+_q0]         ; q0
        movdqa      xmm7,                   [rsp+_q1]         ; q1
%endif

        pxor        xmm2,                   xmm3              ; p1 offset to convert to signed values
        pxor        xmm7,                   xmm3              ; q1 offset to convert to signed values

        psubsb      xmm2,                   xmm7              ; p1 - q1
        pxor        xmm6,                   xmm3              ; offset to convert to signed values

        pand        xmm2,                   xmm4              ; high var mask (hvm)(p1 - q1)
        pxor        xmm0,                   xmm3              ; offset to convert to signed values

        movdqa      xmm3,                   xmm0              ; q0
        psubsb      xmm0,                   xmm6              ; q0 - p0
        paddsb      xmm2,                   xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
        paddsb      xmm2,                   xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
        paddsb      xmm2,                   xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
        pand        xmm1,                   xmm2              ; mask filter values we don't care about

        movdqa      xmm2,                   xmm1
        paddsb      xmm1,                   [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      xmm2,                   [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        punpckhbw   xmm5,                   xmm2              ; axbxcxdx
        punpcklbw   xmm2,                   xmm2              ; exfxgxhx

        punpcklbw   xmm0,                   xmm1              ; exfxgxhx
        psraw       xmm5,                   11                ; sign extended shift right by 3

        punpckhbw   xmm1,                   xmm1              ; axbxcxdx
        psraw       xmm2,                   11                ; sign extended shift right by 3

        packsswb    xmm2,                   xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
        psraw       xmm0,                   11                ; sign extended shift right by 3

        psraw       xmm1,                   11                ; sign extended shift right by 3
        movdqa      xmm5,                   xmm0              ; save results

        packsswb    xmm0,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3

        paddsb      xmm6,                   xmm2              ; p0+= p0 add

        movdqa      xmm2,                   [GLOBAL(ones)]
        paddsw      xmm5,                   xmm2
        paddsw      xmm1,                   xmm2
        psraw       xmm5,                   1                 ; partial shifted one more time for 2nd tap
        psraw       xmm1,                   1                 ; partial shifted one more time for 2nd tap
        packsswb    xmm5,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
        movdqa      xmm2,                   [GLOBAL(t80)]

%if %1 == 0
        movdqa      xmm1,                   [rsp+_p1]         ; p1
        lea         rsi,                    [rsi + rcx*2]
        lea         rdi,                    [rdi + rcx*2]
%elif %1 == 1
        movdqa      xmm1,                   [rsi+2*rax]       ; p1
%elif %1 == 2
        movdqa      xmm1,                   [rsp+_p1]         ; p1
%endif

        pandn       xmm4,                   xmm5              ; high edge variance additive
        pxor        xmm6,                   xmm2              ; unoffset

        pxor        xmm1,                   xmm2              ; reoffset
        psubsb      xmm3,                   xmm0              ; q0-= q0 add

        paddsb      xmm1,                   xmm4              ; p1+= p1 add
        pxor        xmm3,                   xmm2              ; unoffset

        pxor        xmm1,                   xmm2              ; unoffset
        psubsb      xmm7,                   xmm4              ; q1-= q1 add

        pxor        xmm7,                   xmm2              ; unoffset
%if %1 == 0
        movq        [rsi],                  xmm6              ; p0
        movhps      [rdi],                  xmm6
        movq        [rsi + rax],            xmm1              ; p1
        movhps      [rdi + rax],            xmm1
        movq        [rsi + rcx],            xmm3              ; q0
        movhps      [rdi + rcx],            xmm3
        movq        [rsi + rcx*2],          xmm7              ; q1
        movhps      [rdi + rcx*2],          xmm7
%elif %1 == 1
        movdqa      [rsi+rax],              xmm6              ; write back
        movdqa      [rsi+2*rax],            xmm1              ; write back
        movdqa      [rsi],                  xmm3              ; write back
        movdqa      [rdi],                  xmm7              ; write back
%endif

%endmacro

%if ABI_IS_32BIT

;void vp8_loop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
global sym(vp8_loop_filter_horizontal_edge_sse2) PRIVATE
sym(vp8_loop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, lf_var_size

        mov         rsi,                    arg(0)           ;src_ptr
        movsxd      rax,                    dword ptr arg(1) ;src_pixel_step

        mov         rdx,                    arg(3)           ;limit

        lea         rdi,                    [rsi+rax]        ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the result
        B_FILTER 1

    add rsp, lf_var_size
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

%endif

;void vp8_loop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_horizontal_edge_uv_sse2) PRIVATE
sym(vp8_loop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, lf_var_size

        mov         rsi,                    arg(0)             ; u
        mov         rdi,                    arg(5)             ; v
        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
        mov         rcx,                    rax
        neg         rax                     ; negate pitch to deal with above border

        mov         rdx,                    arg(3)             ;limit

        lea         rsi,                    [rsi + rcx]
        lea         rdi,                    [rdi + rcx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the result
        B_FILTER 0

    add rsp, lf_var_size
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


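; MB_FILTER_AND_WRITEBACK %1 applies the macroblock filter to p2 .. q2
; and stores the result.
;   %1 == 0: horizontal U/V edge (stack spill area, movq/movhps stores)
;   %1 == 1: horizontal Y edge (movdqa loads/stores through rsi/rdi)
;   %1 == 2: vertical edge; the results are left on the stack for the
;            transpose-and-write-back step that follows
; Roughly, per byte (an illustrative C sketch, clamps elided):
;   f2 = f & hev;   p0 += clamp(f2 + 3) >> 3;   q0 -= clamp(f2 + 4) >> 3;
;   u  = f & ~hev;
;   u1 = (u *  9 + 63) >> 7;   p2 += u1;   q2 -= u1;
;   u2 = (u * 18 + 63) >> 7;   p1 += u2;   q1 -= u2;
;   u3 = (u * 27 + 63) >> 7;   p0 += u3;   q0 -= u3;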
%macro MB_FILTER_AND_WRITEBACK 1
        movdqa      xmm3,                   [GLOBAL(t80)]
%if %1 == 0
        movdqa      xmm2,                   [rsp+_p1]       ; p1
        movdqa      xmm7,                   [rsp+_q1]       ; q1
%elif %1 == 1
        movdqa      xmm2,                   [rsi+2*rax]     ; p1
        movdqa      xmm7,                   [rdi]           ; q1

        mov         rcx,                    rax
        neg         rcx
%elif %1 == 2
        movdqa      xmm2,                   [rsp+_p1]       ; p1
        movdqa      xmm6,                   [rsp+_p0]       ; p0
        movdqa      xmm0,                   [rsp+_q0]       ; q0
        movdqa      xmm7,                   [rsp+_q1]       ; q1
%endif

        pxor        xmm2,                   xmm3            ; p1 offset to convert to signed values
        pxor        xmm7,                   xmm3            ; q1 offset to convert to signed values
        pxor        xmm6,                   xmm3            ; offset to convert to signed values
        pxor        xmm0,                   xmm3            ; offset to convert to signed values

        psubsb      xmm2,                   xmm7            ; p1 - q1

        movdqa      xmm3,                   xmm0            ; q0
        psubsb      xmm0,                   xmm6            ; q0 - p0
        paddsb      xmm2,                   xmm0            ; 1 * (q0 - p0) + (p1 - q1)
        paddsb      xmm2,                   xmm0            ; 2 * (q0 - p0) + (p1 - q1)
        paddsb      xmm2,                   xmm0            ; 3 * (q0 - p0) + (p1 - q1)
        pand        xmm1,                   xmm2            ; mask filter values we don't care about

        movdqa      xmm2,                   xmm1            ; vp8_filter

        pand        xmm2,                   xmm4            ; Filter2 = vp8_filter & hev
        pxor        xmm0,                   xmm0

        pandn       xmm4,                   xmm1            ; vp8_filter&=~hev
        pxor        xmm1,                   xmm1

        punpcklbw   xmm0,                   xmm4            ; Filter 2 (hi)
        punpckhbw   xmm1,                   xmm4            ; Filter 2 (lo)

        movdqa      xmm5,                   xmm2

        movdqa      xmm4,                   [GLOBAL(s9)]
        paddsb      xmm5,                   [GLOBAL(t3)]    ; vp8_signed_char_clamp(Filter2 + 3)
        paddsb      xmm2,                   [GLOBAL(t4)]    ; vp8_signed_char_clamp(Filter2 + 4)

        pmulhw      xmm1,                   xmm4            ; Filter 2 (lo) * 9
        pmulhw      xmm0,                   xmm4            ; Filter 2 (hi) * 9

        punpckhbw   xmm7,                   xmm5            ; axbxcxdx
        punpcklbw   xmm5,                   xmm5            ; exfxgxhx

        psraw       xmm7,                   11              ; sign extended shift right by 3

        psraw       xmm5,                   11              ; sign extended shift right by 3
        punpckhbw   xmm4,                   xmm2            ; axbxcxdx

        punpcklbw   xmm2,                   xmm2            ; exfxgxhx
        psraw       xmm4,                   11              ; sign extended shift right by 3

        packsswb    xmm5,                   xmm7            ; Filter2 >>=3;
        psraw       xmm2,                   11              ; sign extended shift right by 3

        packsswb    xmm2,                   xmm4            ; Filter1 >>=3;

        paddsb      xmm6,                   xmm5            ; ps0 = ps0 + Filter2

        psubsb      xmm3,                   xmm2            ; qs0 = qs0 - Filter1
        movdqa      xmm7,                   xmm1

        movdqa      xmm4,                   [GLOBAL(s63)]
        movdqa      xmm5,                   xmm0
        movdqa      xmm2,                   xmm5
        paddw       xmm0,                   xmm4            ; Filter 2 (hi) * 9 + 63
        paddw       xmm1,                   xmm4            ; Filter 2 (lo) * 9 + 63
        movdqa      xmm4,                   xmm7

        paddw       xmm5,                   xmm5            ; Filter 2 (hi) * 18

        paddw       xmm7,                   xmm7            ; Filter 2 (lo) * 18
        paddw       xmm5,                   xmm0            ; Filter 2 (hi) * 27 + 63

        paddw       xmm7,                   xmm1            ; Filter 2 (lo) * 27 + 63
        paddw       xmm2,                   xmm0            ; Filter 2 (hi) * 18 + 63
        psraw       xmm0,                   7               ; (Filter 2 (hi) * 9 + 63) >> 7

        paddw       xmm4,                   xmm1            ; Filter 2 (lo) * 18 + 63
        psraw       xmm1,                   7               ; (Filter 2 (lo) * 9 + 63) >> 7
        psraw       xmm2,                   7               ; (Filter 2 (hi) * 18 + 63) >> 7

        packsswb    xmm0,                   xmm1            ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)

        psraw       xmm4,                   7               ; (Filter 2 (lo) * 18 + 63) >> 7
        psraw       xmm5,                   7               ; (Filter 2 (hi) * 27 + 63) >> 7
        psraw       xmm7,                   7               ; (Filter 2 (lo) * 27 + 63) >> 7

        packsswb    xmm5,                   xmm7            ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
        packsswb    xmm2,                   xmm4            ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
        movdqa      xmm7,                   [GLOBAL(t80)]

%if %1 == 0
        movdqa      xmm1,                   [rsp+_q1]       ; q1
        movdqa      xmm4,                   [rsp+_p1]       ; p1
        lea         rsi,                    [rsi+rcx*2]
        lea         rdi,                    [rdi+rcx*2]

%elif %1 == 1
        movdqa      xmm1,                   [rdi]           ; q1
        movdqa      xmm4,                   [rsi+rax*2]     ; p1
%elif %1 == 2
        movdqa      xmm4,                   [rsp+_p1]       ; p1
        movdqa      xmm1,                   [rsp+_q1]       ; q1
%endif

        pxor        xmm1,                   xmm7
        pxor        xmm4,                   xmm7

        psubsb      xmm3,                   xmm5            ; sq = vp8_signed_char_clamp(qs0 - u3)
        paddsb      xmm6,                   xmm5            ; sp = vp8_signed_char_clamp(ps0 + u3)
        psubsb      xmm1,                   xmm2            ; sq = vp8_signed_char_clamp(qs1 - u2)
        paddsb      xmm4,                   xmm2            ; sp = vp8_signed_char_clamp(ps1 + u2)

%if %1 == 1
        movdqa      xmm2,                   [rdi+rax*4]     ; p2
        movdqa      xmm5,                   [rdi+rcx]       ; q2
%else
        movdqa      xmm2,                   [rsp+_p2]       ; p2
        movdqa      xmm5,                   [rsp+_q2]       ; q2
%endif

        pxor        xmm1,                   xmm7            ; *oq1 = sq^0x80;
        pxor        xmm4,                   xmm7            ; *op1 = sp^0x80;
        pxor        xmm2,                   xmm7
        pxor        xmm5,                   xmm7
        paddsb      xmm2,                   xmm0            ; sp = vp8_signed_char_clamp(ps2 + u1)
        psubsb      xmm5,                   xmm0            ; sq = vp8_signed_char_clamp(qs2 - u1)
        pxor        xmm2,                   xmm7            ; *op2 = sp^0x80;
        pxor        xmm5,                   xmm7            ; *oq2 = sq^0x80;
        pxor        xmm3,                   xmm7            ; *oq0 = sq^0x80
        pxor        xmm6,                   xmm7            ; *op0 = sp^0x80
%if %1 == 0
        movq        [rsi],                  xmm6            ; p0
        movhps      [rdi],                  xmm6
        movq        [rsi + rcx],            xmm3            ; q0
        movhps      [rdi + rcx],            xmm3
        lea         rdx,                    [rcx + rcx*2]
        movq        [rsi+rcx*2],            xmm1            ; q1
        movhps      [rdi+rcx*2],            xmm1

        movq        [rsi + rax],            xmm4            ; p1
        movhps      [rdi + rax],            xmm4

        movq        [rsi+rax*2],            xmm2            ; p2
        movhps      [rdi+rax*2],            xmm2

        movq        [rsi+rdx],              xmm5            ; q2
        movhps      [rdi+rdx],              xmm5
%elif %1 == 1
        movdqa      [rdi+rcx],              xmm5            ; q2
        movdqa      [rdi],                  xmm1            ; q1
        movdqa      [rsi],                  xmm3            ; q0
        movdqa      [rsi+rax  ],            xmm6            ; p0
        movdqa      [rsi+rax*2],            xmm4            ; p1
        movdqa      [rdi+rax*4],            xmm2            ; p2
%elif %1 == 2
        movdqa      [rsp+_p1],              xmm4            ; p1
        movdqa      [rsp+_p0],              xmm6            ; p0
        movdqa      [rsp+_q0],              xmm3            ; q0
        movdqa      [rsp+_q1],              xmm1            ; q1
%endif

%endmacro


;void vp8_mbloop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
global sym(vp8_mbloop_filter_horizontal_edge_sse2) PRIVATE
sym(vp8_mbloop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, lf_var_size

        mov         rsi,                    arg(0)            ;src_ptr
        movsxd      rax,                    dword ptr arg(1)  ;src_pixel_step
        mov         rdx,                    arg(3)            ;limit

        lea         rdi,                    [rsi+rax]         ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 1

    add rsp, lf_var_size
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_mbloop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) PRIVATE
sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, lf_var_size

        mov         rsi,                    arg(0)             ; u
        mov         rdi,                    arg(5)             ; v
        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
        mov         rcx,                    rax
        neg         rax                     ; negate pitch to deal with above border
        mov         rdx,                    arg(3)             ;limit

        lea         rsi,                    [rsi + rcx]
        lea         rdi,                    [rdi + rcx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 0

    add rsp, lf_var_size
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


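; TRANSPOSE_16X8 %1 %2 transposes a 16x8 block (16 rows of 8 pixels read
; through rsi/rdi) into eight 16-byte rows p3 .. q3.
;   %1 == 1: both 8-row halves come from the same plane (advance rsi/rdi
;            by 8 rows); %1 == 0: the second half comes from the V plane
;            (arg(5)).
;   %2 == 0: spill all eight rows to the stack (the macroblock filter
;            re-reads p3/p2/q2/q3); %2 == 1: keep p3/p2/q2/q3 in
;            xmm2/xmm1/xmm6/xmm7 for the mask computation and spill only
;            p1 .. q1.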
%macro TRANSPOSE_16X8 2
        movq        xmm4,               [rsi]           ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
        movq        xmm1,               [rdi]           ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
        movq        xmm0,               [rsi+2*rax]     ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
        movq        xmm7,               [rdi+2*rax]     ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
        movq        xmm5,               [rsi+4*rax]     ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
        movq        xmm2,               [rdi+4*rax]     ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50

        punpcklbw   xmm4,               xmm1            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00

        movq        xmm1,               [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70

        movdqa      xmm3,               xmm4            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
        punpcklbw   xmm0,               xmm7            ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20

        movq        xmm7,               [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60

        punpcklbw   xmm5,               xmm2            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
%if %1
        lea         rsi,                [rsi+rax*8]
        lea         rdi,                [rdi+rax*8]
%else
        mov         rsi,                arg(5)          ; v_ptr
%endif

        movdqa      xmm6,               xmm5            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
        punpcklbw   xmm7,               xmm1            ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
        punpcklwd   xmm5,               xmm7            ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
        punpckhwd   xmm6,               xmm7            ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
        punpcklwd   xmm3,               xmm0            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00

%if %1 == 0
        lea         rdi,                [rsi + rax - 4] ; rdi points to row +1 for indirect addressing
        lea         rsi,                [rsi - 4]
%endif

        movdqa      xmm2,               xmm3            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm4,               xmm0            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04

        movdqa      xmm7,               xmm4            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
        punpckhdq   xmm3,               xmm5            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        punpckhdq   xmm7,               xmm6            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpckldq   xmm4,               xmm6            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04

        punpckldq   xmm2,               xmm5            ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00

        movdqa      [rsp+_t0],          xmm2            ; save to free XMM2

        movq        xmm2,               [rsi]           ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
        movq        xmm6,               [rdi]           ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
        movq        xmm0,               [rsi+2*rax]     ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
        movq        xmm5,               [rdi+2*rax]     ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
        movq        xmm1,               [rsi+4*rax]     ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0

        punpcklbw   xmm2,               xmm6            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        movq        xmm6,               [rdi+4*rax]     ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0

        punpcklbw   xmm0,               xmm5            ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0

        movq        xmm5,               [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0

        punpcklbw   xmm1,               xmm6            ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0

        movq        xmm6,               [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0

        punpcklbw   xmm5,               xmm6            ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0

        movdqa      xmm6,               xmm1            ;
        punpckhwd   xmm6,               xmm5            ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4

        punpcklwd   xmm1,               xmm5            ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
        movdqa      xmm5,               xmm2            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        punpcklwd   xmm5,               xmm0            ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80

        punpckhwd   xmm2,               xmm0            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        movdqa      xmm0,               xmm5
        punpckldq   xmm0,               xmm1            ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80

        punpckhdq   xmm5,               xmm1            ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
        movdqa      xmm1,               xmm2            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        punpckldq   xmm1,               xmm6            ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84

        punpckhdq   xmm2,               xmm6            ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
        movdqa      xmm6,               xmm7            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpcklqdq  xmm6,               xmm2            ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06

        punpckhqdq  xmm7,               xmm2            ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07

%if %2 == 0
        movdqa      [rsp+_q3],          xmm7            ; save 7
        movdqa      [rsp+_q2],          xmm6            ; save 6
%endif
        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        movdqa      [rsp+_p1],          xmm2            ; save 2

        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        movdqa      [rsp+_p0],          xmm3            ; save 3

        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rsp+_q0],          xmm4            ; save 4
        movdqa      [rsp+_q1],          xmm5            ; save 5
        movdqa      xmm1,               [rsp+_t0]

        movdqa      xmm2,               xmm1            ;
        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

%if %2 == 0
        movdqa      [rsp+_p2],          xmm1
        movdqa      [rsp+_p3],          xmm2
%endif

%endmacro

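; LFV_FILTER_MASK_HEV_MASK is the vertical-edge variant of the mask
; logic above. It expects the TRANSPOSE_16X8 register layout (p3/p2 in
; xmm2/xmm1, q2/q3 in xmm6/xmm7, p0 in xmm3, q1 in xmm5) and reloads
; p1/q0/q1 from the stack; it leaves xmm1 = filter mask and xmm4 = hev
; mask, just like the horizontal version.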
%macro LFV_FILTER_MASK_HEV_MASK 0
        movdqa      xmm0,               xmm6            ; q2
        psubusb     xmm0,               xmm7            ; q2-q3

        psubusb     xmm7,               xmm6            ; q3-q2
        movdqa      xmm4,               xmm5            ; q1

        por         xmm7,               xmm0            ; abs (q3-q2)
        psubusb     xmm4,               xmm6            ; q1-q2

        movdqa      xmm0,               xmm1
        psubusb     xmm6,               xmm5            ; q2-q1

        por         xmm6,               xmm4            ; abs (q2-q1)
        psubusb     xmm0,               xmm2            ; p2 - p3;

        psubusb     xmm2,               xmm1            ; p3 - p2;
        por         xmm0,               xmm2            ; abs(p2-p3)

        movdqa      xmm5,               [rsp+_p1]       ; p1
        pmaxub      xmm0,               xmm7

        movdqa      xmm2,               xmm5            ; p1
        psubusb     xmm5,               xmm1            ; p1-p2
        psubusb     xmm1,               xmm2            ; p2-p1

        movdqa      xmm7,               xmm3            ; p0
        psubusb     xmm7,               xmm2            ; p0-p1

        por         xmm1,               xmm5            ; abs(p2-p1)
        pmaxub      xmm0,               xmm6

        pmaxub      xmm0,               xmm1
        movdqa      xmm1,               xmm2            ; p1

        psubusb     xmm2,               xmm3            ; p1-p0

        por         xmm2,               xmm7            ; abs(p1-p0)

        pmaxub      xmm0,               xmm2

        movdqa      xmm5,               [rsp+_q0]       ; q0
        movdqa      xmm7,               [rsp+_q1]       ; q1

        mov         rdx,                arg(3)          ; limit

        movdqa      xmm6,               xmm5            ; q0
        movdqa      xmm4,               xmm7            ; q1

        psubusb     xmm5,               xmm7            ; q0-q1
        psubusb     xmm7,               xmm6            ; q1-q0

        por         xmm7,               xmm5            ; abs(q1-q0)

        pmaxub      xmm0,               xmm7

        psubusb     xmm0,               [rdx]           ; limit

        mov         rdx,                arg(2)          ; blimit
        movdqa      xmm5,               xmm4            ; q1

        psubusb     xmm5,               xmm1            ; q1-=p1
        psubusb     xmm1,               xmm4            ; p1-=q1

        por         xmm5,               xmm1            ; abs(p1-q1)
        movdqa      xmm1,               xmm3            ; p0

        pand        xmm5,               [GLOBAL(tfe)]   ; set lsb of each byte to zero
        psubusb     xmm1,               xmm6            ; p0-q0

        movdqa      xmm4,               [rdx]           ; blimit
        mov         rdx,                arg(4)          ; get thresh

        psrlw       xmm5,               1               ; abs(p1-q1)/2
        psubusb     xmm6,               xmm3            ; q0-p0

        por         xmm1,               xmm6            ; abs(q0-p0)
        paddusb     xmm1,               xmm1            ; abs(q0-p0)*2
        movdqa      xmm3,               [rdx]

        paddusb     xmm1,               xmm5            ; abs (p0 - q0) *2 + abs(p1-q1)/2
        psubusb     xmm2,               xmm3            ; abs(p1 - p0) > thresh

        psubusb     xmm7,               xmm3            ; abs(q1 - q0) > thresh

        psubusb     xmm1,               xmm4            ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
        por         xmm2,               xmm7            ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        por         xmm1,               xmm0            ; mask
        pcmpeqb     xmm2,               xmm0            ; hev (xmm0 is zero wherever the final mask passes)

        pxor        xmm0,               xmm0
        pcmpeqb     xmm4,               xmm4

        pcmpeqb     xmm1,               xmm0            ; mask
        pxor        xmm4,               xmm2            ; hev
%endmacro

%macro BV_TRANSPOSE 0
        ; xmm1 =    f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; xmm6 =    f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        ; xmm3 =    f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        ; xmm7 =    f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
        movdqa      xmm2,               xmm1            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpcklbw   xmm2,               xmm6            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02

        movdqa      xmm4,               xmm3            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm1,               xmm6            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklbw   xmm4,               xmm7            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        punpckhbw   xmm3,               xmm7            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        movdqa      xmm6,               xmm2            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpcklwd   xmm2,               xmm4            ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02

        punpckhwd   xmm6,               xmm4            ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        movdqa      xmm5,               xmm1            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklwd   xmm1,               xmm3            ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82

        punpckhwd   xmm5,               xmm3            ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
%endmacro

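; BV_WRITEBACK %1 %2 stores the four filtered columns (p1 p0 q0 q1,
; i.e. bytes 2..5 of each transposed row) back into eight rows as
; 4-byte writes at column offset +2; %1 carries the first four rows,
; %2 the next four.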
%macro BV_WRITEBACK 2
        movd        [rsi+2],            %1
        movd        [rsi+4*rax+2],      %2
        psrldq      %1,                 4
        psrldq      %2,                 4
        movd        [rdi+2],            %1
        movd        [rdi+4*rax+2],      %2
        psrldq      %1,                 4
        psrldq      %2,                 4
        movd        [rsi+2*rax+2],      %1
        movd        [rsi+2*rcx+2],      %2
        psrldq      %1,                 4
        psrldq      %2,                 4
        movd        [rdi+2*rax+2],      %1
        movd        [rdi+2*rcx+2],      %2
%endmacro

%if ABI_IS_32BIT

;void vp8_loop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
global sym(vp8_loop_filter_vertical_edge_sse2) PRIVATE
sym(vp8_loop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub             rsp, lf_var_size

        mov         rsi,        arg(0)                  ; src_ptr
        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step

        lea         rsi,        [rsi - 4]
        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
        lea         rcx,        [rax*2+rax]

        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
        TRANSPOSE_16X8 1, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        ; start work on filters
        B_FILTER 2

        ; transpose and write back - only work on q1, q0, p0, p1
        BV_TRANSPOSE
        ; store 16-line result

        lea         rdx,        [rax]
        neg         rdx

        BV_WRITEBACK xmm1, xmm5

        lea         rsi,        [rsi+rdx*8]
        lea         rdi,        [rdi+rdx*8]
        BV_WRITEBACK xmm2, xmm6

    add rsp, lf_var_size
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

%endif

;void vp8_loop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_vertical_edge_uv_sse2) PRIVATE
sym(vp8_loop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub             rsp, lf_var_size

        mov         rsi,        arg(0)                  ; u_ptr
        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step

        lea         rsi,        [rsi - 4]
        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
        lea         rcx,        [rax+2*rax]

        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
        TRANSPOSE_16X8 0, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        ; start work on filters
        B_FILTER 2

        ; transpose and write back - only work on q1, q0, p0, p1
        BV_TRANSPOSE

        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing

        ; store 16-line result
        BV_WRITEBACK xmm1, xmm5

        mov         rsi,        arg(0)                  ; u_ptr
        lea         rsi,        [rsi - 4]
        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
        BV_WRITEBACK xmm2, xmm6

    add rsp, lf_var_size
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

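; MBV_TRANSPOSE and the two MBV_WRITEBACK halves transpose the filtered
; rows (p3 and q3 from the stack, p2/q2 still in xmm2/xmm5, p1 .. q1
; from the stack spill area) back into sixteen 8-byte rows;
; MBV_WRITEBACK_1 stores rows 0 .. 7, MBV_WRITEBACK_2 rows 8 .. f.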
%macro MBV_TRANSPOSE 0
        movdqa      xmm0,               [rsp+_p3]           ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        movdqa      xmm1,               xmm0                ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        punpcklbw   xmm0,               xmm2                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm1,               xmm2                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm7,               [rsp+_p1]           ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        movdqa      xmm6,               xmm7                ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpcklbw   xmm7,               [rsp+_p0]           ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm6,               [rsp+_p0]           ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        movdqa      xmm3,               xmm0                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpcklwd   xmm0,               xmm7                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckhwd   xmm3,               xmm7                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        movdqa      xmm4,               xmm1                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        punpcklwd   xmm1,               xmm6                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm4,               xmm6                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

        movdqa      xmm7,               [rsp+_q0]           ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpcklbw   xmm7,               [rsp+_q1]           ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        movdqa      xmm6,               xmm5                ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
        punpcklbw   xmm6,               [rsp+_q3]           ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06

        movdqa      xmm2,               xmm7                ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
        punpcklwd   xmm7,               xmm6                ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04

        punpckhwd   xmm2,               xmm6                ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
        movdqa      xmm6,               xmm0                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckldq   xmm0,               xmm7                ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
        punpckhdq   xmm6,               xmm7                ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
%endmacro

%macro MBV_WRITEBACK_1 0
        movq        [rsi],              xmm0
        movhps      [rdi],              xmm0

        movq        [rsi+2*rax],        xmm6
        movhps      [rdi+2*rax],        xmm6

        movdqa      xmm0,               xmm3                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        punpckldq   xmm0,               xmm2                ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
        punpckhdq   xmm3,               xmm2                ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60

        movq        [rsi+4*rax],        xmm0
        movhps      [rdi+4*rax],        xmm0

        movq        [rsi+2*rcx],        xmm3
        movhps      [rdi+2*rcx],        xmm3

        movdqa      xmm7,               [rsp+_q0]           ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm7,               [rsp+_q1]           ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
        punpckhbw   xmm5,               [rsp+_q3]           ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86

        movdqa      xmm0,               xmm7
   1110         punpcklwd   xmm0,               xmm5                ; b7 b6 b4 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
   1111         punpckhwd   xmm7,               xmm5                ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
   1112 
   1113         movdqa      xmm5,               xmm1                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
   1114         punpckldq   xmm1,               xmm0                ; 97 96 95 94 93 92 91 90 87 86 85 83 84 82 81 80
   1115         punpckhdq   xmm5,               xmm0                ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
   1116 %endmacro
   1117 
%macro MBV_WRITEBACK_2 0
        movq        [rsi],              xmm1
        movhps      [rdi],              xmm1

        movq        [rsi+2*rax],        xmm5
        movhps      [rdi+2*rax],        xmm5

        movdqa      xmm1,               xmm4                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
        punpckldq   xmm1,               xmm7                ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
        punpckhdq   xmm4,               xmm7                ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0

        movq        [rsi+4*rax],        xmm1
        movhps      [rdi+4*rax],        xmm1

        movq        [rsi+2*rcx],        xmm4
        movhps      [rdi+2*rcx],        xmm4
%endmacro


;void vp8_mbloop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
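; Filters the vertical edge running down the middle of the 16x8 block at
; src_ptr - 4: the block is transposed so each pixel column (p3..q3)
; lines up in registers and stack slots, the filter and high-edge-variance
; masks are computed, the macroblock filter is applied, and the result is
; transposed back and stored via MBV_TRANSPOSE / MBV_WRITEBACK_*.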
global sym(vp8_mbloop_filter_vertical_edge_sse2) PRIVATE
sym(vp8_mbloop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, lf_var_size

        mov         rsi,                arg(0)              ; src_ptr
        movsxd      rax,                dword ptr arg(1)    ; src_pixel_step

        lea         rsi,                [rsi - 4]
        lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
        lea         rcx,                [rax*2+rax]

        ; Transpose
        TRANSPOSE_16X8 1, 0

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        neg         rax
        ; start work on filters
        MB_FILTER_AND_WRITEBACK 2

        lea         rsi,                [rsi+rax*8]
        lea         rdi,                [rdi+rax*8]

        ; transpose and write back
        MBV_TRANSPOSE

        neg         rax

        MBV_WRITEBACK_1

        lea         rsi,                [rsi+rax*8]
        lea         rdi,                [rdi+rax*8]
        MBV_WRITEBACK_2

    add rsp, lf_var_size
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_mbloop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
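; Same filter for the chroma planes: the eight U rows and eight V rows are
; handled as a single 16-row batch (the 0 first argument to TRANSPOSE_16X8
; selects its two-plane load path), and after filtering MBV_WRITEBACK_1
; stores the first eight rows back to U while MBV_WRITEBACK_2 stores the
; second eight to V.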
global sym(vp8_mbloop_filter_vertical_edge_uv_sse2) PRIVATE
sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, lf_var_size

        mov         rsi,                arg(0)              ; u_ptr
        movsxd      rax,                dword ptr arg(1)    ; src_pixel_step

        lea         rsi,                [rsi - 4]
        lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
        lea         rcx,                [rax+2*rax]

        ; Transpose
        TRANSPOSE_16X8 0, 0

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        ; start work on filters
        MB_FILTER_AND_WRITEBACK 2

        ; transpose and write back
        MBV_TRANSPOSE

        mov         rsi,                arg(0)              ; u_ptr
        lea         rsi,                [rsi - 4]
        lea         rdi,                [rsi + rax]
        MBV_WRITEBACK_1
        mov         rsi,                arg(5)              ; v_ptr
        lea         rsi,                [rsi - 4]
        lea         rdi,                [rsi + rax]
        MBV_WRITEBACK_2

    add rsp, lf_var_size
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_loop_filter_simple_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *blimit
;)
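; The simple filter touches only p0 and q0. For each pixel position where
;     abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit
; it computes, with signed saturation throughout:
;     Filter  = p1 - q1 + 3 * (q0 - p0)
;     Filter1 = (Filter + 4) >> 3,  q0 -= Filter1
;     Filter2 = (Filter + 3) >> 3,  p0 += Filter2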
global sym(vp8_loop_filter_simple_horizontal_edge_sse2) PRIVATE
sym(vp8_loop_filter_simple_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 7
    GET_GOT     rbx
    ; end prolog

        mov         rcx, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step (row stride in bytes)
        movdqa      xmm6, [GLOBAL(tfe)]
        lea         rdx, [rcx + rax]
        neg         rax

        ; calculate mask
        movdqa      xmm0, [rdx]             ; q1
        mov         rdx, arg(2)             ; blimit
        movdqa      xmm1, [rcx+2*rax]       ; p1

        movdqa      xmm2, xmm1
        movdqa      xmm3, xmm0

        psubusb     xmm0, xmm1              ; q1-=p1
        psubusb     xmm1, xmm3              ; p1-=q1
        por         xmm1, xmm0              ; abs(p1-q1)
        pand        xmm1, xmm6              ; clear lsb of each byte
        psrlw       xmm1, 1                 ; abs(p1-q1)/2

        movdqa      xmm7, XMMWORD PTR [rdx] ; blimit

        movdqa      xmm5, [rcx+rax]         ; p0
        movdqa      xmm4, [rcx]             ; q0
        movdqa      xmm0, xmm4              ; q0
        movdqa      xmm6, xmm5              ; p0
        psubusb     xmm5, xmm4              ; p0-=q0
        psubusb     xmm4, xmm6              ; q0-=p0
        por         xmm5, xmm4              ; abs(p0 - q0)

        movdqa      xmm4, [GLOBAL(t80)]

        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
        paddusb     xmm5, xmm1              ; abs(p0-q0)*2 + abs(p1-q1)/2
        psubusb     xmm5, xmm7              ; abs(p0-q0)*2 + abs(p1-q1)/2 > blimit
        pxor        xmm7, xmm7
        pcmpeqb     xmm5, xmm7              ; mask: 0xff where <= blimit

        ; start work on filters
        pxor        xmm2, xmm4              ; p1 offset to convert to signed values
        pxor        xmm3, xmm4              ; q1 offset to convert to signed values
        psubsb      xmm2, xmm3              ; p1 - q1

        pxor        xmm6, xmm4              ; p0 offset to convert to signed values
        pxor        xmm0, xmm4              ; q0 offset to convert to signed values
        movdqa      xmm3, xmm0              ; q0
        psubsb      xmm0, xmm6              ; q0 - p0
        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
        paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
        pand        xmm5, xmm2              ; mask filter values we don't care about

        movdqa      xmm0, xmm5
        paddsb      xmm5,        [GLOBAL(t3)]                  ; Filter2 = Filter + 3
        paddsb      xmm0,        [GLOBAL(t4)]                  ; Filter1 = Filter + 4

        movdqa      xmm1, [GLOBAL(te0)]
        movdqa      xmm2, [GLOBAL(t1f)]
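
; SSE2 has no per-byte arithmetic shift, so the signed ">> 3" is emulated:
; pcmpgtb against zero captures each byte's sign, psrlw shifts the whole
; register (pulling bits across byte boundaries), t1f (0x1f) masks off the
; three bits dragged in from the neighboring byte, and te0 (0xe0) restores
; the three sign bits.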
;        pxor        xmm7, xmm7             ; not needed: xmm7 is still zero from the mask compare above
        pcmpgtb     xmm7, xmm0              ; save sign
        pand        xmm7, xmm1              ; preserve the upper 3 bits
        psrlw       xmm0, 3
        pand        xmm0, xmm2              ; clear out upper 3 bits
        por         xmm0, xmm7              ; add sign
        psubsb      xmm3, xmm0              ; q0 -= Filter1

        pxor        xmm7, xmm7
        pcmpgtb     xmm7, xmm5              ; save sign
        pand        xmm7, xmm1              ; preserve the upper 3 bits
        psrlw       xmm5, 3
        pand        xmm5, xmm2              ; clear out upper 3 bits
        por         xmm5, xmm7              ; add sign
        paddsb      xmm6, xmm5              ; p0 += Filter2

        pxor        xmm3, xmm4              ; unoffset
        movdqa      [rcx], xmm3             ; write back q0

        pxor        xmm6, xmm4              ; unoffset
        movdqa      [rcx+rax], xmm6         ; write back p0

    ; begin epilog
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_loop_filter_simple_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *blimit
;)
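; Vertical-edge version of the simple filter: four pixels are loaded from
; each of 16 rows starting 2 pixels left of the edge, transposed into the
; p1/p0/q0/q1 row vectors, filtered exactly as above, then transposed back
; and written out four bytes per row.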
global sym(vp8_loop_filter_simple_vertical_edge_sse2) PRIVATE
sym(vp8_loop_filter_simple_vertical_edge_sse2):
    push        rbp         ; save old base pointer value.
    mov         rbp, rsp    ; set new base pointer value.
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 7
    GET_GOT     rbx         ; save callee-saved reg
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                         ; reserve 32 bytes
    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[16];

        mov         rsi, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step (row stride in bytes)

        lea         rsi,        [rsi - 2]
        lea         rdi,        [rsi + rax]
        lea         rdx,        [rsi + rax*4]
        lea         rcx,        [rdx + rax]

        movd        xmm0,       [rsi]                   ; (high 96 bits unused) 03 02 01 00
        movd        xmm1,       [rdx]                   ; (high 96 bits unused) 43 42 41 40
        movd        xmm2,       [rdi]                   ; 13 12 11 10
        movd        xmm3,       [rcx]                   ; 53 52 51 50
        punpckldq   xmm0,       xmm1                    ; (high 64 bits unused) 43 42 41 40 03 02 01 00
        punpckldq   xmm2,       xmm3                    ; 53 52 51 50 13 12 11 10

        movd        xmm4,       [rsi + rax*2]           ; 23 22 21 20
        movd        xmm5,       [rdx + rax*2]           ; 63 62 61 60
        movd        xmm6,       [rdi + rax*2]           ; 33 32 31 30
        movd        xmm7,       [rcx + rax*2]           ; 73 72 71 70
        punpckldq   xmm4,       xmm5                    ; 63 62 61 60 23 22 21 20
        punpckldq   xmm6,       xmm7                    ; 73 72 71 70 33 32 31 30

        punpcklbw   xmm0,       xmm2                    ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
        punpcklbw   xmm4,       xmm6                    ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20

        movdqa      xmm1,       xmm0
        punpcklwd   xmm0,       xmm4                    ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm1,       xmm4                    ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

        movdqa      xmm2,       xmm0
        punpckldq   xmm0,       xmm1                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
        punpckhdq   xmm2,       xmm1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        lea         rsi,        [rsi + rax*8]
        lea         rdi,        [rsi + rax]
        lea         rdx,        [rsi + rax*4]
        lea         rcx,        [rdx + rax]

        movd        xmm4,       [rsi]                   ; 83 82 81 80
        movd        xmm1,       [rdx]                   ; c3 c2 c1 c0
        movd        xmm6,       [rdi]                   ; 93 92 91 90
        movd        xmm3,       [rcx]                   ; d3 d2 d1 d0
        punpckldq   xmm4,       xmm1                    ; c3 c2 c1 c0 83 82 81 80
        punpckldq   xmm6,       xmm3                    ; d3 d2 d1 d0 93 92 91 90

        movd        xmm1,       [rsi + rax*2]           ; a3 a2 a1 a0
        movd        xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
        movd        xmm3,       [rdi + rax*2]           ; b3 b2 b1 b0
        movd        xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
        punpckldq   xmm1,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
        punpckldq   xmm3,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0

        punpcklbw   xmm4,       xmm6                    ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
        punpcklbw   xmm1,       xmm3                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0

        movdqa      xmm7,       xmm4
        punpcklwd   xmm4,       xmm1                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
        punpckhwd   xmm7,       xmm1                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0

        movdqa      xmm6,       xmm4
        punpckldq   xmm4,       xmm7                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
        punpckhdq   xmm6,       xmm7                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82

        movdqa      xmm1,       xmm0
        movdqa      xmm3,       xmm2

        punpcklqdq  xmm0,       xmm4                    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        punpckhqdq  xmm1,       xmm4                    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        punpcklqdq  xmm2,       xmm6                    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3,       xmm6                    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        mov         rdx,        arg(2)                          ; blimit

        ; calculate mask
        movdqa      xmm6,       xmm0                            ; p1
        movdqa      xmm7,       xmm3                            ; q1
        psubusb     xmm7,       xmm0                            ; q1-=p1
        psubusb     xmm6,       xmm3                            ; p1-=q1
        por         xmm6,       xmm7                            ; abs(p1-q1)
        pand        xmm6,       [GLOBAL(tfe)]                   ; clear lsb of each byte
        psrlw       xmm6,       1                               ; abs(p1-q1)/2

        movdqa      xmm7, [rdx]                                 ; blimit

        movdqa      xmm5,       xmm1                            ; p0
        movdqa      xmm4,       xmm2                            ; q0
        psubusb     xmm5,       xmm2                            ; p0-=q0
        psubusb     xmm4,       xmm1                            ; q0-=p0
        por         xmm5,       xmm4                            ; abs(p0 - q0)
        paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
        paddusb     xmm5,       xmm6                            ; abs(p0-q0)*2 + abs(p1-q1)/2

        movdqa      xmm4, [GLOBAL(t80)]

        psubusb     xmm5,        xmm7                           ; abs(p0-q0)*2 + abs(p1-q1)/2 > blimit
        pxor        xmm7,        xmm7
        pcmpeqb     xmm5,        xmm7                           ; xmm5 = mask: 0xff where <= blimit

        ; start work on filters
        movdqa        t0,        xmm0                           ; save p1
        movdqa        t1,        xmm3                           ; save q1

        pxor        xmm0,        xmm4                           ; p1 offset to convert to signed values
        pxor        xmm3,        xmm4                           ; q1 offset to convert to signed values
        psubsb      xmm0,        xmm3                           ; p1 - q1

        pxor        xmm1,        xmm4                           ; p0 offset to convert to signed values
        pxor        xmm2,        xmm4                           ; q0 offset to convert to signed values

        movdqa      xmm3,        xmm2                           ; signed-offset q0
        psubsb      xmm2,        xmm1                           ; q0 - p0
        paddsb      xmm0,        xmm2                           ; p1 - q1 + 1 * (q0 - p0)
        paddsb      xmm0,        xmm2                           ; p1 - q1 + 2 * (q0 - p0)
        paddsb      xmm0,        xmm2                           ; p1 - q1 + 3 * (q0 - p0)
        pand        xmm5,        xmm0                           ; mask filter values we don't care about

        movdqa      xmm0, xmm5
        paddsb      xmm5,        [GLOBAL(t3)]                   ; Filter2 = Filter + 3
        paddsb      xmm0,        [GLOBAL(t4)]                   ; Filter1 = Filter + 4

        movdqa      xmm6, [GLOBAL(te0)]
        movdqa      xmm2, [GLOBAL(t1f)]
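
; same signed ">> 3" byte-shift emulation as in the horizontal simple
; filter above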
;        pxor        xmm7, xmm7             ; not needed: xmm7 is still zero from the mask compare above
        pcmpgtb     xmm7, xmm0              ; save sign
        pand        xmm7, xmm6              ; preserve the upper 3 bits
        psrlw       xmm0, 3
        pand        xmm0, xmm2              ; clear out upper 3 bits
        por         xmm0, xmm7              ; add sign
        psubsb      xmm3, xmm0              ; q0 -= Filter1

        pxor        xmm7, xmm7
        pcmpgtb     xmm7, xmm5              ; save sign
        pand        xmm7, xmm6              ; preserve the upper 3 bits
        psrlw       xmm5, 3
        pand        xmm5, xmm2              ; clear out upper 3 bits
        por         xmm5, xmm7              ; add sign
        paddsb      xmm1, xmm5              ; p0 += Filter2

        pxor        xmm3,        xmm4                           ; unoffset q0
        pxor        xmm1,        xmm4                           ; unoffset p0

        movdqa      xmm0,        t0                             ; p1
        movdqa      xmm4,        t1                             ; q1

        ; interleave for write-out: p1 (xmm0), p0 (xmm1), q0 (xmm3), q1 (xmm4)
        lea         rdx,        [rsi + rax*4]

        ; transpose back to write out
        ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        movdqa      xmm6,       xmm0
        punpcklbw   xmm0,       xmm1                               ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm6,       xmm1                               ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm5,       xmm3
        punpcklbw   xmm3,       xmm4                               ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm5,       xmm4                               ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        movdqa      xmm2,       xmm0
        punpcklwd   xmm0,       xmm3                               ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
        punpckhwd   xmm2,       xmm3                               ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40

        movdqa      xmm3,       xmm6
        punpcklwd   xmm6,       xmm5                               ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm3,       xmm5                               ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

        movd        [rsi],      xmm6                               ; write the second 8-line result
        movd        [rdx],      xmm3
        psrldq      xmm6,       4
        psrldq      xmm3,       4
        movd        [rdi],      xmm6
        movd        [rcx],      xmm3
        psrldq      xmm6,       4
        psrldq      xmm3,       4
        movd        [rsi + rax*2], xmm6
        movd        [rdx + rax*2], xmm3
        psrldq      xmm6,       4
        psrldq      xmm3,       4
        movd        [rdi + rax*2], xmm6
        movd        [rcx + rax*2], xmm3

        neg         rax
        lea         rsi,        [rsi + rax*8]                      ; step back up 8 rows
        neg         rax
        lea         rdi,        [rsi + rax]
        lea         rdx,        [rsi + rax*4]
        lea         rcx,        [rdx + rax]

        movd        [rsi],      xmm0                               ; write the first 8-line result
        movd        [rdx],      xmm2
        psrldq      xmm0,       4
        psrldq      xmm2,       4
        movd        [rdi],      xmm0
        movd        [rcx],      xmm2
        psrldq      xmm0,       4
        psrldq      xmm2,       4
        movd        [rsi + rax*2], xmm0
        movd        [rdx + rax*2], xmm2
        psrldq      xmm0,       4
        psrldq      xmm2,       4
        movd        [rdi + rax*2], xmm0
        movd        [rcx + rax*2], xmm2

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
tfe:                        ; mask to clear the low bit of each byte
    times 16 db 0xfe
align 16
t80:                        ; sign-bit offset for signed<->unsigned conversion
    times 16 db 0x80
align 16
t1s:
    times 16 db 0x01
align 16
t3:                         ; rounding constant for Filter2
    times 16 db 0x03
align 16
t4:                         ; rounding constant for Filter1
    times 16 db 0x04
align 16
ones:
    times 8 dw 0x0001
align 16
s9:
    times 8 dw 0x0900
align 16
s63:
    times 8 dw 0x003f
align 16
te0:                        ; keeps the upper 3 (sign) bits after psrlw 3
    times 16 db 0xe0
align 16
t1f:                        ; clears the upper 3 bits after psrlw 3
    times 16 db 0x1f