;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
%define _t0 0
%define _t1 _t0 + 16
%define _p3 _t1 + 16
%define _p2 _p3 + 16
%define _p1 _p2 + 16
%define _p0 _p1 + 16
%define _q0 _p0 + 16
%define _q1 _q0 + 16
%define _q2 _q1 + 16
%define _q3 _q2 + 16
%define lf_var_size 160
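; Scratch layout: two temporaries (_t0, _t1) plus the eight pixel rows
; p3..q3 -- ten 16-byte slots, hence lf_var_size = 10 * 16 = 160.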

; Use of pmaxub instead of psubusb to compute the filter mask was seen
; in ffvp8
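;
; That is, instead of a saturating subtract of the limit from every
; abs(a - b) followed by OR-ing the results, the absolute differences are
; folded into one running byte-wise maximum and compared against the
; limit a single time.  A hedged C sketch of the idea (illustrative
; names, not libvpx API):
;
;     unsigned char m = 0;                    /* pmaxub accumulator     */
;     m = max(m, abs(p3 - p2));  m = max(m, abs(p2 - p1));
;     m = max(m, abs(p1 - p0));  m = max(m, abs(q1 - q0));
;     m = max(m, abs(q2 - q1));  m = max(m, abs(q3 - q2));
;     mask_passes = (m <= limit);             /* one psubusb at the end */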

%macro LFH_FILTER_AND_HEV_MASK 1
%if %1
        movdqa      xmm2,                   [rdi+2*rax]       ; q3
        movdqa      xmm1,                   [rsi+2*rax]       ; q2
        movdqa      xmm4,                   [rsi+rax]         ; q1
        movdqa      xmm5,                   [rsi]             ; q0
        neg         rax                     ; negate pitch to deal with above border
%else
        movlps      xmm2,                   [rsi + rcx*2]     ; q3
        movlps      xmm1,                   [rsi + rcx]       ; q2
        movlps      xmm4,                   [rsi]             ; q1
        movlps      xmm5,                   [rsi + rax]       ; q0

        movhps      xmm2,                   [rdi + rcx*2]
        movhps      xmm1,                   [rdi + rcx]
        movhps      xmm4,                   [rdi]
        movhps      xmm5,                   [rdi + rax]

        lea         rsi,                    [rsi + rax*4]
        lea         rdi,                    [rdi + rax*4]

        movdqa      [rsp+_q2],              xmm1              ; store q2
        movdqa      [rsp+_q1],              xmm4              ; store q1
%endif
        movdqa      xmm7,                   [rdx]             ;limit

        movdqa      xmm6,                   xmm1              ; q2
        movdqa      xmm3,                   xmm4              ; q1

        psubusb     xmm1,                   xmm2              ; q2-=q3
        psubusb     xmm2,                   xmm6              ; q3-=q2

        psubusb     xmm4,                   xmm6              ; q1-=q2
        psubusb     xmm6,                   xmm3              ; q2-=q1

        por         xmm4,                   xmm6              ; abs(q2-q1)
        por         xmm1,                   xmm2              ; abs(q3-q2)

        movdqa      xmm0,                   xmm5              ; q0
        pmaxub      xmm1,                   xmm4

        psubusb     xmm5,                   xmm3              ; q0-=q1
        psubusb     xmm3,                   xmm0              ; q1-=q0

        por         xmm5,                   xmm3              ; abs(q0-q1)
        movdqa      [rsp+_t0],              xmm5              ; save to t0

        pmaxub      xmm1,                   xmm5

%if %1
        movdqa      xmm2,                   [rsi+4*rax]       ; p3
        movdqa      xmm4,                   [rdi+4*rax]       ; p2
        movdqa      xmm6,                   [rsi+2*rax]       ; p1
%else
        movlps      xmm2,                   [rsi + rax]       ; p3
        movlps      xmm4,                   [rsi]             ; p2
        movlps      xmm6,                   [rsi + rcx]       ; p1

        movhps      xmm2,                   [rdi + rax]
        movhps      xmm4,                   [rdi]
        movhps      xmm6,                   [rdi + rcx]

        movdqa      [rsp+_p2],              xmm4              ; store p2
        movdqa      [rsp+_p1],              xmm6              ; store p1
%endif

        movdqa      xmm5,                   xmm4              ; p2
        movdqa      xmm3,                   xmm6              ; p1

        psubusb     xmm4,                   xmm2              ; p2-=p3
        psubusb     xmm2,                   xmm5              ; p3-=p2

        psubusb     xmm3,                   xmm5              ; p1-=p2
        pmaxub      xmm1,                   xmm4              ; abs(p3 - p2)

        psubusb     xmm5,                   xmm6              ; p2-=p1
        pmaxub      xmm1,                   xmm2              ; abs(p3 - p2)

        pmaxub      xmm1,                   xmm5              ; abs(p2 - p1)
        movdqa      xmm2,                   xmm6              ; p1

        pmaxub      xmm1,                   xmm3              ; abs(p2 - p1)
%if %1
        movdqa      xmm4,                   [rsi+rax]         ; p0
        movdqa      xmm3,                   [rdi]             ; q1
%else
        movlps      xmm4,                   [rsi + rcx*2]     ; p0
        movhps      xmm4,                   [rdi + rcx*2]
        movdqa      xmm3,                   [rsp+_q1]                ; q1
%endif

        movdqa      xmm5,                   xmm4              ; p0
        psubusb     xmm4,                   xmm6              ; p0-=p1

        psubusb     xmm6,                   xmm5              ; p1-=p0

        por         xmm6,                   xmm4              ; abs(p1 - p0)
        mov         rdx,                    arg(2)            ; get blimit

        movdqa     [rsp+_t1],               xmm6              ; save to t1

        movdqa      xmm4,                   xmm3              ; q1
        pmaxub      xmm1,                   xmm6

        psubusb     xmm3,                   xmm2              ; q1-=p1
        psubusb     xmm2,                   xmm4              ; p1-=q1

        psubusb     xmm1,                   xmm7
        por         xmm2,                   xmm3              ; abs(p1-q1)

        movdqa      xmm7,                   [rdx]             ; blimit
        mov         rdx,                    arg(4)            ; hev get thresh

        movdqa      xmm3,                   xmm0              ; q0
        pand        xmm2,                   [GLOBAL(tfe)]     ; set lsb of each byte to zero

        movdqa      xmm6,                   xmm5              ; p0
        psrlw       xmm2,                   1                 ; abs(p1-q1)/2

        psubusb     xmm5,                   xmm3              ; p0-=q0
        psubusb     xmm3,                   xmm6              ; q0-=p0
        por         xmm5,                   xmm3              ; abs(p0 - q0)

        paddusb     xmm5,                   xmm5              ; abs(p0-q0)*2

        movdqa      xmm4,                   [rsp+_t0]                ; hev get abs (q1 - q0)
        movdqa      xmm3,                   [rsp+_t1]                ; get abs (p1 - p0)

        paddusb     xmm5,                   xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        movdqa      xmm2,                   [rdx]             ; hev

        psubusb     xmm5,                   xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
        psubusb     xmm4,                   xmm2              ; hev

        psubusb     xmm3,                   xmm2              ; hev
        por         xmm1,                   xmm5

        pxor        xmm7,                   xmm7
        paddb       xmm4,                   xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        pcmpeqb     xmm4,                   xmm5              ; hev
        pcmpeqb     xmm3,                   xmm3              ; hev

        pcmpeqb     xmm1,                   xmm7              ; mask xmm1
        pxor        xmm4,                   xmm3              ; hev
%endmacro
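
; For reference, a hedged C sketch of what the macro leaves in xmm1 (the
; filter mask) and xmm4 (the hev mask), per byte lane; the scalar form
; here is illustrative, not libvpx API:
;
;     mask = abs(p3-p2) <= limit && abs(p2-p1) <= limit &&
;            abs(p1-p0) <= limit && abs(q1-q0) <= limit &&
;            abs(q2-q1) <= limit && abs(q3-q2) <= limit &&
;            abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit;      /* 0x00 or 0xff */
;     hev  = abs(p1-p0) > thresh || abs(q1-q0) > thresh; /* 0x00 or 0xff */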

%macro B_FILTER 1
        movdqa      xmm3,                   [GLOBAL(t80)]
%if %1 == 0
        movdqa      xmm2,                   [rsp+_p1]                ; p1
        movdqa      xmm7,                   [rsp+_q1]                ; q1
%elif %1 == 1
        movdqa      xmm2,                   [rsi+2*rax]       ; p1
        movdqa      xmm7,                   [rdi]             ; q1
%elif %1 == 2
        movdqa      xmm2,                   [rsp+_p1]         ; p1
        movdqa      xmm6,                   [rsp+_p0]         ; p0
        movdqa      xmm0,                   [rsp+_q0]         ; q0
        movdqa      xmm7,                   [rsp+_q1]         ; q1
%endif

        pxor        xmm2,                   xmm3              ; p1 offset to convert to signed values
        pxor        xmm7,                   xmm3              ; q1 offset to convert to signed values

        psubsb      xmm2,                   xmm7              ; p1 - q1
        pxor        xmm6,                   xmm3              ; offset to convert to signed values

        pand        xmm2,                   xmm4              ; high var mask (hvm)(p1 - q1)
        pxor        xmm0,                   xmm3              ; offset to convert to signed values

        movdqa      xmm3,                   xmm0              ; q0
        psubsb      xmm0,                   xmm6              ; q0 - p0
        paddsb      xmm2,                   xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
        paddsb      xmm2,                   xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
        paddsb      xmm2,                   xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
        pand        xmm1,                   xmm2              ; mask filter values we don't care about

        movdqa      xmm2,                   xmm1
        paddsb      xmm1,                   [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      xmm2,                   [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        punpckhbw   xmm5,                   xmm2              ; axbxcxdx
        punpcklbw   xmm2,                   xmm2              ; exfxgxhx

        punpcklbw   xmm0,                   xmm1              ; exfxgxhx
        psraw       xmm5,                   11                ; sign extended shift right by 3

        punpckhbw   xmm1,                   xmm1              ; axbxcxdx
        psraw       xmm2,                   11                ; sign extended shift right by 3

        packsswb    xmm2,                   xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
        psraw       xmm0,                   11                ; sign extended shift right by 3

        psraw       xmm1,                   11                ; sign extended shift right by 3
        movdqa      xmm5,                   xmm0              ; save results

        packsswb    xmm0,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3

        paddsb      xmm6,                   xmm2              ; p0+= p0 add

        movdqa      xmm2,                   [GLOBAL(ones)]
        paddsw      xmm5,                   xmm2
        paddsw      xmm1,                   xmm2
        psraw       xmm5,                   1                 ; partial shifted one more time for 2nd tap
        psraw       xmm1,                   1                 ; partial shifted one more time for 2nd tap
        packsswb    xmm5,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
        movdqa      xmm2,                   [GLOBAL(t80)]

%if %1 == 0
        movdqa      xmm1,                   [rsp+_p1]         ; p1
        lea         rsi,                    [rsi + rcx*2]
        lea         rdi,                    [rdi + rcx*2]
%elif %1 == 1
        movdqa      xmm1,                   [rsi+2*rax]       ; p1
%elif %1 == 2
        movdqa      xmm1,                   [rsp+_p1]         ; p1
%endif

        pandn       xmm4,                   xmm5              ; high edge variance additive
        pxor        xmm6,                   xmm2              ; unoffset

        pxor        xmm1,                   xmm2              ; reoffset
        psubsb      xmm3,                   xmm0              ; q0-= q0 add

        paddsb      xmm1,                   xmm4              ; p1+= p1 add
        pxor        xmm3,                   xmm2              ; unoffset

        pxor        xmm1,                   xmm2              ; unoffset
        psubsb      xmm7,                   xmm4              ; q1-= q1 add

        pxor        xmm7,                   xmm2              ; unoffset
%if %1 == 0
        movq        [rsi],                  xmm6              ; p0
        movhps      [rdi],                  xmm6
        movq        [rsi + rax],            xmm1              ; p1
        movhps      [rdi + rax],            xmm1
        movq        [rsi + rcx],            xmm3              ; q0
        movhps      [rdi + rcx],            xmm3
        movq        [rsi + rcx*2],          xmm7              ; q1
        movhps      [rdi + rcx*2],          xmm7
%elif %1 == 1
        movdqa      [rsi+rax],              xmm6              ; write back
        movdqa      [rsi+2*rax],            xmm1              ; write back
        movdqa      [rsi],                  xmm3              ; write back
        movdqa      [rdi],                  xmm7              ; write back
%endif

%endmacro
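
; A hedged C sketch of the per-lane arithmetic above (signed 8-bit with
; saturating clamp(); hvm() zeroes the p1-q1 term where hev is not set,
; and the whole filter value is zeroed where the mask is not set):
;
;     f   = clamp(hvm(ps1 - qs1) + 3 * (qs0 - ps0)) & mask;
;     F1  = clamp(f + 4) >> 3;    qs0 = clamp(qs0 - F1);
;     F2  = clamp(f + 3) >> 3;    ps0 = clamp(ps0 + F2);
;     u   = (F1 + 1) >> 1;        /* outer tap, applied only where !hev */
;     qs1 = clamp(qs1 - u);       ps1 = clamp(ps1 + u);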

SECTION .text

%if ABI_IS_32BIT

;void vp8_loop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;)
global sym(vp8_loop_filter_horizontal_edge_sse2) PRIVATE
sym(vp8_loop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, lf_var_size

        mov         rsi,                    arg(0)           ;src_ptr
        movsxd      rax,                    dword ptr arg(1) ;src_pixel_step

        mov         rdx,                    arg(3)           ;limit

        lea         rdi,                    [rsi+rax]        ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the result
        B_FILTER 1

    add rsp, lf_var_size
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

%endif

;void vp8_loop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_horizontal_edge_uv_sse2) PRIVATE
sym(vp8_loop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, lf_var_size

        mov         rsi,                    arg(0)             ; u
        mov         rdi,                    arg(5)             ; v
        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
        mov         rcx,                    rax
        neg         rax                     ; negate pitch to deal with above border

        mov         rdx,                    arg(3)             ;limit

        lea         rsi,                    [rsi + rcx]
        lea         rdi,                    [rdi + rcx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the result
        B_FILTER 0

    add rsp, lf_var_size
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

%macro MB_FILTER_AND_WRITEBACK 1
        movdqa      xmm3,                   [GLOBAL(t80)]
%if %1 == 0
        movdqa      xmm2,                   [rsp+_p1]              ; p1
        movdqa      xmm7,                   [rsp+_q1]              ; q1
%elif %1 == 1
        movdqa      xmm2,                   [rsi+2*rax]     ; p1
        movdqa      xmm7,                   [rdi]           ; q1

        mov         rcx,                    rax
        neg         rcx
%elif %1 == 2
        movdqa      xmm2,                   [rsp+_p1]       ; p1
        movdqa      xmm6,                   [rsp+_p0]       ; p0
        movdqa      xmm0,                   [rsp+_q0]       ; q0
        movdqa      xmm7,                   [rsp+_q1]       ; q1
%endif

        pxor        xmm2,                   xmm3            ; p1 offset to convert to signed values
        pxor        xmm7,                   xmm3            ; q1 offset to convert to signed values
        pxor        xmm6,                   xmm3            ; offset to convert to signed values
        pxor        xmm0,                   xmm3            ; offset to convert to signed values

        psubsb      xmm2,                   xmm7            ; p1 - q1

        movdqa      xmm3,                   xmm0            ; q0
        psubsb      xmm0,                   xmm6            ; q0 - p0
        paddsb      xmm2,                   xmm0            ; 1 * (q0 - p0) + (p1 - q1)
        paddsb      xmm2,                   xmm0            ; 2 * (q0 - p0) + (p1 - q1)
        paddsb      xmm2,                   xmm0            ; 3 * (q0 - p0) + (p1 - q1)
        pand        xmm1,                   xmm2            ; mask filter values we don't care about

        movdqa      xmm2,                   xmm1            ; vp8_filter

        pand        xmm2,                   xmm4            ; Filter2 = vp8_filter & hev
        pxor        xmm0,                   xmm0

        pandn       xmm4,                   xmm1            ; vp8_filter&=~hev
        pxor        xmm1,                   xmm1

        punpcklbw   xmm0,                   xmm4            ; Filter 2 (hi)
        punpckhbw   xmm1,                   xmm4            ; Filter 2 (lo)

        movdqa      xmm5,                   xmm2

        movdqa      xmm4,                   [GLOBAL(s9)]
        paddsb      xmm5,                   [GLOBAL(t3)]    ; vp8_signed_char_clamp(Filter2 + 3)
        paddsb      xmm2,                   [GLOBAL(t4)]    ; vp8_signed_char_clamp(Filter2 + 4)

        pmulhw      xmm1,                   xmm4            ; Filter 2 (lo) * 9
        pmulhw      xmm0,                   xmm4            ; Filter 2 (hi) * 9

        punpckhbw   xmm7,                   xmm5            ; axbxcxdx
        punpcklbw   xmm5,                   xmm5            ; exfxgxhx

        psraw       xmm7,                   11              ; sign extended shift right by 3

        psraw       xmm5,                   11              ; sign extended shift right by 3
        punpckhbw   xmm4,                   xmm2            ; axbxcxdx

        punpcklbw   xmm2,                   xmm2            ; exfxgxhx
        psraw       xmm4,                   11              ; sign extended shift right by 3

        packsswb    xmm5,                   xmm7            ; Filter2 >>=3;
        psraw       xmm2,                   11              ; sign extended shift right by 3

        packsswb    xmm2,                   xmm4            ; Filter1 >>=3;

        paddsb      xmm6,                   xmm5            ; ps0 = ps0 + Filter2

        psubsb      xmm3,                   xmm2            ; qs0 = qs0 - Filter1
        movdqa      xmm7,                   xmm1

        movdqa      xmm4,                   [GLOBAL(s63)]
        movdqa      xmm5,                   xmm0
        movdqa      xmm2,                   xmm5
        paddw       xmm0,                   xmm4            ; Filter 2 (hi) * 9 + 63
        paddw       xmm1,                   xmm4            ; Filter 2 (lo) * 9 + 63
        movdqa      xmm4,                   xmm7

        paddw       xmm5,                   xmm5            ; Filter 2 (hi) * 18

        paddw       xmm7,                   xmm7            ; Filter 2 (lo) * 18
        paddw       xmm5,                   xmm0            ; Filter 2 (hi) * 27 + 63

        paddw       xmm7,                   xmm1            ; Filter 2 (lo) * 27 + 63
        paddw       xmm2,                   xmm0            ; Filter 2 (hi) * 18 + 63
        psraw       xmm0,                   7               ; (Filter 2 (hi) * 9 + 63) >> 7

        paddw       xmm4,                   xmm1            ; Filter 2 (lo) * 18 + 63
        psraw       xmm1,                   7               ; (Filter 2 (lo) * 9 + 63) >> 7
        psraw       xmm2,                   7               ; (Filter 2 (hi) * 18 + 63) >> 7

        packsswb    xmm0,                   xmm1            ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)

        psraw       xmm4,                   7               ; (Filter 2 (lo) * 18 + 63) >> 7
        psraw       xmm5,                   7               ; (Filter 2 (hi) * 27 + 63) >> 7
        psraw       xmm7,                   7               ; (Filter 2 (lo) * 27 + 63) >> 7

        packsswb    xmm5,                   xmm7            ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
        packsswb    xmm2,                   xmm4            ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
        movdqa      xmm7,                   [GLOBAL(t80)]

%if %1 == 0
        movdqa      xmm1,                   [rsp+_q1]       ; q1
        movdqa      xmm4,                   [rsp+_p1]       ; p1
        lea         rsi,                    [rsi+rcx*2]
        lea         rdi,                    [rdi+rcx*2]

%elif %1 == 1
        movdqa      xmm1,                   [rdi]           ; q1
        movdqa      xmm4,                   [rsi+rax*2]     ; p1
%elif %1 == 2
        movdqa      xmm4,                   [rsp+_p1]       ; p1
        movdqa      xmm1,                   [rsp+_q1]       ; q1
%endif

        pxor        xmm1,                   xmm7
        pxor        xmm4,                   xmm7

        psubsb      xmm3,                   xmm5            ; sq = vp8_signed_char_clamp(qs0 - u3)
        paddsb      xmm6,                   xmm5            ; sp = vp8_signed_char_clamp(ps0 + u3)
        psubsb      xmm1,                   xmm2            ; sq = vp8_signed_char_clamp(qs1 - u2)
        paddsb      xmm4,                   xmm2            ; sp = vp8_signed_char_clamp(ps1 + u2)

%if %1 == 1
        movdqa      xmm2,                   [rdi+rax*4]     ; p2
        movdqa      xmm5,                   [rdi+rcx]       ; q2
%else
        movdqa      xmm2,                   [rsp+_p2]       ; p2
        movdqa      xmm5,                   [rsp+_q2]       ; q2
%endif

        pxor        xmm1,                   xmm7            ; *oq1 = sq^0x80;
        pxor        xmm4,                   xmm7            ; *op1 = sp^0x80;
        pxor        xmm2,                   xmm7
        pxor        xmm5,                   xmm7
        paddsb      xmm2,                   xmm0            ; sp = vp8_signed_char_clamp(ps2 + u1)
        psubsb      xmm5,                   xmm0            ; sq = vp8_signed_char_clamp(qs2 - u1)
        pxor        xmm2,                   xmm7            ; *op2 = sp^0x80;
        pxor        xmm5,                   xmm7            ; *oq2 = sq^0x80;
        pxor        xmm3,                   xmm7            ; *oq0 = sq^0x80
        pxor        xmm6,                   xmm7            ; *op0 = sp^0x80
%if %1 == 0
        movq        [rsi],                  xmm6            ; p0
        movhps      [rdi],                  xmm6
        movq        [rsi + rcx],            xmm3            ; q0
        movhps      [rdi + rcx],            xmm3
        lea         rdx,                    [rcx + rcx*2]
        movq        [rsi+rcx*2],            xmm1            ; q1
        movhps      [rdi+rcx*2],            xmm1

        movq        [rsi + rax],            xmm4            ; p1
        movhps      [rdi + rax],            xmm4

        movq        [rsi+rax*2],            xmm2            ; p2
        movhps      [rdi+rax*2],            xmm2

        movq        [rsi+rdx],              xmm5            ; q2
        movhps      [rdi+rdx],              xmm5
%elif %1 == 1
        movdqa      [rdi+rcx],              xmm5            ; q2
        movdqa      [rdi],                  xmm1            ; q1
        movdqa      [rsi],                  xmm3            ; q0
        movdqa      [rsi+rax  ],            xmm6            ; p0
        movdqa      [rsi+rax*2],            xmm4            ; p1
        movdqa      [rdi+rax*4],            xmm2            ; p2
%elif %1 == 2
        movdqa      [rsp+_p1],              xmm4            ; p1
        movdqa      [rsp+_p0],              xmm6            ; p0
        movdqa      [rsp+_q0],              xmm3            ; q0
        movdqa      [rsp+_q1],              xmm1            ; q1
%endif

%endmacro
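
; A hedged C sketch of the strong (macroblock) filter above, per lane,
; with clamp() standing in for vp8_signed_char_clamp():
;
;     f  = clamp((ps1 - qs1) + 3 * (qs0 - ps0)) & mask;
;     F2 = f & hev;                      /* narrow 4-tap part            */
;     F1 = clamp(F2 + 4) >> 3;   qs0 = clamp(qs0 - F1);
;     F2 = clamp(F2 + 3) >> 3;   ps0 = clamp(ps0 + F2);
;     f &= ~hev;                         /* wide part, 16-bit arithmetic */
;     u3 = clamp((27 * f + 63) >> 7);  qs0 = clamp(qs0 - u3);  ps0 = clamp(ps0 + u3);
;     u2 = clamp((18 * f + 63) >> 7);  qs1 = clamp(qs1 - u2);  ps1 = clamp(ps1 + u2);
;     u1 = clamp(( 9 * f + 63) >> 7);  qs2 = clamp(qs2 - u1);  ps2 = clamp(ps2 + u1);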

;void vp8_mbloop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;)
global sym(vp8_mbloop_filter_horizontal_edge_sse2) PRIVATE
sym(vp8_mbloop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, lf_var_size

        mov         rsi,                    arg(0)            ;src_ptr
        movsxd      rax,                    dword ptr arg(1)  ;src_pixel_step
        mov         rdx,                    arg(3)            ;limit

        lea         rdi,                    [rsi+rax]         ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 1

    add rsp, lf_var_size
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_mbloop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) PRIVATE
sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, lf_var_size

        mov         rsi,                    arg(0)             ; u
        mov         rdi,                    arg(5)             ; v
        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
        mov         rcx,                    rax
        neg         rax                     ; negate pitch to deal with above border
        mov         rdx,                    arg(3)             ;limit

        lea         rsi,                    [rsi + rcx]
        lea         rdi,                    [rdi + rcx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 0

    add rsp, lf_var_size
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


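; The 16x8 -> 8x16 transpose below works by repeated interleaving with
; punpck{l,h}{bw,wd,dq,qdq}: bytes into words, words into dwords, dwords
; into qwords, doubling the transposed element size at each stage.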
%macro TRANSPOSE_16X8 2
        movq        xmm4,               [rsi]           ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
        movq        xmm1,               [rdi]           ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
        movq        xmm0,               [rsi+2*rax]     ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
        movq        xmm7,               [rdi+2*rax]     ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
        movq        xmm5,               [rsi+4*rax]     ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
        movq        xmm2,               [rdi+4*rax]     ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50

        punpcklbw   xmm4,               xmm1            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00

        movq        xmm1,               [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70

        movdqa      xmm3,               xmm4            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
        punpcklbw   xmm0,               xmm7            ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20

        movq        xmm7,               [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60

        punpcklbw   xmm5,               xmm2            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
%if %1
        lea         rsi,                [rsi+rax*8]
        lea         rdi,                [rdi+rax*8]
%else
        mov         rsi,                arg(5)          ; v_ptr
%endif

        movdqa      xmm6,               xmm5            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
        punpcklbw   xmm7,               xmm1            ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
        punpcklwd   xmm5,               xmm7            ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
        punpckhwd   xmm6,               xmm7            ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
        punpcklwd   xmm3,               xmm0            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00

%if %1 == 0
        lea         rdi,                [rsi + rax - 4] ; rdi points to row +1 for indirect addressing
        lea         rsi,                [rsi - 4]
%endif

        movdqa      xmm2,               xmm3            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm4,               xmm0            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04

        movdqa      xmm7,               xmm4            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
        punpckhdq   xmm3,               xmm5            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        punpckhdq   xmm7,               xmm6            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpckldq   xmm4,               xmm6            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04

        punpckldq   xmm2,               xmm5            ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00

        movdqa      [rsp+_t0],          xmm2            ; save to free XMM2

        movq        xmm2,               [rsi]           ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
        movq        xmm6,               [rdi]           ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
        movq        xmm0,               [rsi+2*rax]     ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
        movq        xmm5,               [rdi+2*rax]     ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
        movq        xmm1,               [rsi+4*rax]     ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0

        punpcklbw   xmm2,               xmm6            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        movq        xmm6,               [rdi+4*rax]     ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0

        punpcklbw   xmm0,               xmm5            ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0

        movq        xmm5,               [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0

        punpcklbw   xmm1,               xmm6            ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0

        movq        xmm6,               [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0

        punpcklbw   xmm5,               xmm6            ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0

        movdqa      xmm6,               xmm1            ;
        punpckhwd   xmm6,               xmm5            ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4

        punpcklwd   xmm1,               xmm5            ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
        movdqa      xmm5,               xmm2            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        punpcklwd   xmm5,               xmm0            ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80

        punpckhwd   xmm2,               xmm0            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        movdqa      xmm0,               xmm5
        punpckldq   xmm0,               xmm1            ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80

        punpckhdq   xmm5,               xmm1            ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
        movdqa      xmm1,               xmm2            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        punpckldq   xmm1,               xmm6            ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84

        punpckhdq   xmm2,               xmm6            ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
        movdqa      xmm6,               xmm7            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpcklqdq  xmm6,               xmm2            ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06

        punpckhqdq  xmm7,               xmm2            ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07

%if %2 == 0
        movdqa      [rsp+_q3],          xmm7            ; save 7
        movdqa      [rsp+_q2],          xmm6            ; save 6
%endif
        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        movdqa      [rsp+_p1],          xmm2            ; save 2

        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        movdqa      [rsp+_p0],          xmm3            ; save 3

        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rsp+_q0],          xmm4            ; save 4
        movdqa      [rsp+_q1],          xmm5            ; save 5
        movdqa      xmm1,               [rsp+_t0]

        movdqa      xmm2,               xmm1            ;
        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

%if %2 == 0
        movdqa      [rsp+_p2],          xmm1
        movdqa      [rsp+_p3],          xmm2
%endif

%endmacro

%macro LFV_FILTER_MASK_HEV_MASK 0
        movdqa      xmm0,               xmm6            ; q2
        psubusb     xmm0,               xmm7            ; q2-q3

        psubusb     xmm7,               xmm6            ; q3-q2
        movdqa      xmm4,               xmm5            ; q1

        por         xmm7,               xmm0            ; abs (q3-q2)
        psubusb     xmm4,               xmm6            ; q1-q2

        movdqa      xmm0,               xmm1
        psubusb     xmm6,               xmm5            ; q2-q1

        por         xmm6,               xmm4            ; abs (q2-q1)
        psubusb     xmm0,               xmm2            ; p2 - p3;

        psubusb     xmm2,               xmm1            ; p3 - p2;
        por         xmm0,               xmm2            ; abs(p2-p3)

        movdqa      xmm5,               [rsp+_p1]       ; p1
        pmaxub      xmm0,               xmm7

        movdqa      xmm2,               xmm5            ; p1
        psubusb     xmm5,               xmm1            ; p1-p2
        psubusb     xmm1,               xmm2            ; p2-p1

        movdqa      xmm7,               xmm3            ; p0
        psubusb     xmm7,               xmm2            ; p0-p1

        por         xmm1,               xmm5            ; abs(p2-p1)
        pmaxub      xmm0,               xmm6

        pmaxub      xmm0,               xmm1
        movdqa      xmm1,               xmm2            ; p1

        psubusb     xmm2,               xmm3            ; p1-p0

        por         xmm2,               xmm7            ; abs(p1-p0)

        pmaxub      xmm0,               xmm2

        movdqa      xmm5,               [rsp+_q0]       ; q0
        movdqa      xmm7,               [rsp+_q1]       ; q1

        mov         rdx,                arg(3)          ; limit

        movdqa      xmm6,               xmm5            ; q0
        movdqa      xmm4,               xmm7            ; q1

        psubusb     xmm5,               xmm7            ; q0-q1
        psubusb     xmm7,               xmm6            ; q1-q0

        por         xmm7,               xmm5            ; abs(q1-q0)

        pmaxub      xmm0,               xmm7

        psubusb     xmm0,               [rdx]           ; limit

        mov         rdx,                arg(2)          ; blimit
        movdqa      xmm5,               xmm4            ; q1

        psubusb     xmm5,               xmm1            ; q1-=p1
        psubusb     xmm1,               xmm4            ; p1-=q1

        por         xmm5,               xmm1            ; abs(p1-q1)
        movdqa      xmm1,               xmm3            ; p0

        pand        xmm5,               [GLOBAL(tfe)]   ; set lsb of each byte to zero
        psubusb     xmm1,               xmm6            ; p0-q0

        movdqa      xmm4,               [rdx]           ; blimit
        mov         rdx,                arg(4)          ; get thresh

        psrlw       xmm5,               1               ; abs(p1-q1)/2
        psubusb     xmm6,               xmm3            ; q0-p0

        por         xmm1,               xmm6            ; abs(q0-p0)
        paddusb     xmm1,               xmm1            ; abs(q0-p0)*2
        movdqa      xmm3,               [rdx]

        paddusb     xmm1,               xmm5            ; abs (p0 - q0) *2 + abs(p1-q1)/2
        psubusb     xmm2,               xmm3            ; abs(p1 - p0) > thresh

        psubusb     xmm7,               xmm3            ; abs(q1 - q0) > thresh

        psubusb     xmm1,               xmm4            ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
        por         xmm2,               xmm7            ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        por         xmm1,               xmm0            ; mask
        pcmpeqb     xmm2,               xmm0

        pxor        xmm0,               xmm0
        pcmpeqb     xmm4,               xmm4

        pcmpeqb     xmm1,               xmm0
        pxor        xmm4,               xmm2
%endmacro
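
; Same breakout and hev tests as LFH_FILTER_AND_HEV_MASK (see the C
; sketch there), applied to the transposed rows left in registers and in
; the stack scratch area by TRANSPOSE_16X8.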

%macro BV_TRANSPOSE 0
        ; xmm1 =    f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; xmm6 =    f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        ; xmm3 =    f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        ; xmm7 =    f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
        movdqa      xmm2,               xmm1            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpcklbw   xmm2,               xmm6            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02

        movdqa      xmm4,               xmm3            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm1,               xmm6            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklbw   xmm4,               xmm7            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        punpckhbw   xmm3,               xmm7            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        movdqa      xmm6,               xmm2            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpcklwd   xmm2,               xmm4            ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02

        punpckhwd   xmm6,               xmm4            ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        movdqa      xmm5,               xmm1            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklwd   xmm1,               xmm3            ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82

        punpckhwd   xmm5,               xmm3            ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
%endmacro

%macro BV_WRITEBACK 2
        movd        [rsi+2],            %1
        movd        [rsi+4*rax+2],      %2
        psrldq      %1,                 4
        psrldq      %2,                 4
        movd        [rdi+2],            %1
        movd        [rdi+4*rax+2],      %2
        psrldq      %1,                 4
        psrldq      %2,                 4
        movd        [rsi+2*rax+2],      %1
        movd        [rsi+2*rcx+2],      %2
        psrldq      %1,                 4
        psrldq      %2,                 4
        movd        [rdi+2*rax+2],      %1
        movd        [rdi+2*rcx+2],      %2
%endmacro
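
; BV_WRITEBACK stores only the four filtered columns (p1 p0 q0 q1, hence
; the +2 byte offset into each 8-pixel-wide block) for eight rows per
; invocation, so the vertical filters below invoke it twice.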

%if ABI_IS_32BIT

;void vp8_loop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;)
global sym(vp8_loop_filter_vertical_edge_sse2) PRIVATE
sym(vp8_loop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub             rsp, lf_var_size

        mov         rsi,        arg(0)                  ; src_ptr
        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step

        lea         rsi,        [rsi - 4]
        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
        lea         rcx,        [rax*2+rax]

        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
        TRANSPOSE_16X8 1, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        ; start work on filters
        B_FILTER 2

        ; transpose and write back - only work on q1, q0, p0, p1
        BV_TRANSPOSE
        ; store 16-line result

        lea         rdx,        [rax]
        neg         rdx

        BV_WRITEBACK xmm1, xmm5

        lea         rsi,        [rsi+rdx*8]
        lea         rdi,        [rdi+rdx*8]
        BV_WRITEBACK xmm2, xmm6

    add rsp, lf_var_size
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

%endif

;void vp8_loop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_vertical_edge_uv_sse2) PRIVATE
sym(vp8_loop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub             rsp, lf_var_size

        mov         rsi,        arg(0)                  ; u_ptr
        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step

        lea         rsi,        [rsi - 4]
        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
        lea         rcx,        [rax+2*rax]

        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
        TRANSPOSE_16X8 0, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        ; start work on filters
        B_FILTER 2

        ; transpose and write back - only work on q1, q0, p0, p1
        BV_TRANSPOSE

        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing

        ; store 16-line result
        BV_WRITEBACK xmm1, xmm5

        mov         rsi,        arg(0)                  ; u_ptr
        lea         rsi,        [rsi - 4]
        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
        BV_WRITEBACK xmm2, xmm6

    add rsp, lf_var_size
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

%macro MBV_TRANSPOSE 0
        movdqa      xmm0,               [rsp+_p3]           ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        movdqa      xmm1,               xmm0                ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        punpcklbw   xmm0,               xmm2                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm1,               xmm2                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm7,               [rsp+_p1]           ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        movdqa      xmm6,               xmm7                ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpcklbw   xmm7,               [rsp+_p0]           ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm6,               [rsp+_p0]           ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        movdqa      xmm3,               xmm0                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpcklwd   xmm0,               xmm7                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckhwd   xmm3,               xmm7                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        movdqa      xmm4,               xmm1                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        punpcklwd   xmm1,               xmm6                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm4,               xmm6                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

        movdqa      xmm7,               [rsp+_q0]           ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpcklbw   xmm7,               [rsp+_q1]           ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        movdqa      xmm6,               xmm5                ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
        punpcklbw   xmm6,               [rsp+_q3]           ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06

        movdqa      xmm2,               xmm7                ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
        punpcklwd   xmm7,               xmm6                ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04

        punpckhwd   xmm2,               xmm6                ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
        movdqa      xmm6,               xmm0                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckldq   xmm0,               xmm7                ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
        punpckhdq   xmm6,               xmm7                ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
%endmacro
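
; Unlike BV_WRITEBACK, the MBV_* macros re-transpose and store all eight
; columns (p3..q3) of each row, since the strong filter also modifies
; p2/q2; the writeback is split into two halves (rows 0-7, then 8-15) to
; stay within the XMM register budget.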

%macro MBV_WRITEBACK_1 0
        movq        [rsi],              xmm0
        movhps      [rdi],              xmm0

        movq        [rsi+2*rax],        xmm6
        movhps      [rdi+2*rax],        xmm6

        movdqa      xmm0,               xmm3                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        punpckldq   xmm0,               xmm2                ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
        punpckhdq   xmm3,               xmm2                ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60

        movq        [rsi+4*rax],        xmm0
        movhps      [rdi+4*rax],        xmm0

        movq        [rsi+2*rcx],        xmm3
        movhps      [rdi+2*rcx],        xmm3

        movdqa      xmm7,               [rsp+_q0]           ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm7,               [rsp+_q1]           ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
        punpckhbw   xmm5,               [rsp+_q3]           ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86

        movdqa      xmm0,               xmm7
        punpcklwd   xmm0,               xmm5                ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
        punpckhwd   xmm7,               xmm5                ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4

        movdqa      xmm5,               xmm1                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckldq   xmm1,               xmm0                ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80
        punpckhdq   xmm5,               xmm0                ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
%endmacro
   1119 
%macro MBV_WRITEBACK_2 0
        movq        [rsi],              xmm1
        movhps      [rdi],              xmm1

        movq        [rsi+2*rax],        xmm5
        movhps      [rdi+2*rax],        xmm5

        movdqa      xmm1,               xmm4                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
        punpckldq   xmm1,               xmm7                ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
        punpckhdq   xmm4,               xmm7                ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0

        movq        [rsi+4*rax],        xmm1
        movhps      [rdi+4*rax],        xmm1

        movq        [rsi+2*rcx],        xmm4
        movhps      [rdi+2*rcx],        xmm4
%endmacro


;void vp8_mbloop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
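; Vertical macroblock edge: the SSE2 kernel filters rows, so the 16x8 block
; of pixels straddling the edge (columns p3..q3 of 16 rows) is transposed
; into registers, run through the same mask/filter logic as the horizontal
; case, then transposed back and written out.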
global sym(vp8_mbloop_filter_vertical_edge_sse2) PRIVATE
sym(vp8_mbloop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub          rsp, lf_var_size

        mov         rsi,                arg(0)              ; src_ptr
        movsxd      rax,                dword ptr arg(1)    ; src_pixel_step

        lea         rsi,                [rsi - 4]           ; move back 4 columns to the p3 column
        lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
        lea         rcx,                [rax*2+rax]         ; rcx = 3 * src_pixel_step

        ; Transpose
        TRANSPOSE_16X8 1, 0

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        neg         rax
        ; start work on filters
        MB_FILTER_AND_WRITEBACK 2

        lea         rsi,                [rsi+rax*8]
        lea         rdi,                [rdi+rax*8]

        ; transpose and write back
        MBV_TRANSPOSE

        neg         rax

        MBV_WRITEBACK_1

        lea         rsi,                [rsi+rax*8]
        lea         rdi,                [rdi+rax*8]
        MBV_WRITEBACK_2

    add rsp, lf_var_size
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_mbloop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
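; Chroma variant: u and v are only 8 rows each, so both planes are gathered
; into one 16-row transpose (TRANSPOSE_16X8 with flag 0), filtered together,
; and written back separately: MBV_WRITEBACK_1 to the u plane, then
; MBV_WRITEBACK_2 to the v plane from arg(5).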
global sym(vp8_mbloop_filter_vertical_edge_uv_sse2) PRIVATE
sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub          rsp, lf_var_size

        mov         rsi,                arg(0)              ; u_ptr
        movsxd      rax,                dword ptr arg(1)    ; src_pixel_step

        lea         rsi,                [rsi - 4]           ; move back 4 columns to the p3 column
        lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
        lea         rcx,                [rax+2*rax]         ; rcx = 3 * src_pixel_step

        ; Transpose
        TRANSPOSE_16X8 0, 0

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        ; start work on filters
        MB_FILTER_AND_WRITEBACK 2

        ; transpose and write back
        MBV_TRANSPOSE

        mov         rsi,                arg(0)              ; u_ptr
        lea         rsi,                [rsi - 4]
        lea         rdi,                [rsi + rax]
        MBV_WRITEBACK_1
        mov         rsi,                arg(5)              ; v_ptr
        lea         rsi,                [rsi - 4]
        lea         rdi,                [rsi + rax]
        MBV_WRITEBACK_2

    add rsp, lf_var_size
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_loop_filter_simple_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *blimit
;)
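; The "simple" filter applies only the flatness test against blimit and a
; 4-tap adjustment of p0/q0. Per pixel, the logic below is roughly this
; illustrative C-style sketch (not part of the source; clamp() is signed
; byte saturation and >> is arithmetic):
;
;   mask    = (2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= blimit) ? 0xff : 0x00;
;   filter  = clamp(p1 - q1 + 3 * (q0 - p0)) & mask;
;   Filter1 = clamp(filter + 4) >> 3;   q0 -= Filter1;
;   Filter2 = clamp(filter + 3) >> 3;   p0 += Filter2;
;
; All pixels are biased by 0x80 (t80) so unsigned bytes can be treated as
; signed values for the saturating arithmetic.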
global sym(vp8_loop_filter_simple_horizontal_edge_sse2) PRIVATE
sym(vp8_loop_filter_simple_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 7
    GET_GOT     rbx
    ; end prolog

        mov         rcx, arg(0)             ;src_ptr
        movsxd      rax, dword ptr arg(1)   ;src_pixel_step
        movdqa      xmm6, [GLOBAL(tfe)]
        lea         rdx, [rcx + rax]        ; rdx -> q1 row
        neg         rax

        ; calculate mask
        movdqa      xmm0, [rdx]             ; q1
        mov         rdx, arg(2)             ;blimit
        movdqa      xmm1, [rcx+2*rax]       ; p1

        movdqa      xmm2, xmm1
        movdqa      xmm3, xmm0

        psubusb     xmm0, xmm1              ; q1-=p1
        psubusb     xmm1, xmm3              ; p1-=q1
        por         xmm1, xmm0              ; abs(p1-q1)
        pand        xmm1, xmm6              ; set lsb of each byte to zero
        psrlw       xmm1, 1                 ; abs(p1-q1)/2

        movdqa      xmm7, XMMWORD PTR [rdx]

        movdqa      xmm5, [rcx+rax]         ; p0
        movdqa      xmm4, [rcx]             ; q0
        movdqa      xmm0, xmm4              ; q0
        movdqa      xmm6, xmm5              ; p0
        psubusb     xmm5, xmm4              ; p0-=q0
        psubusb     xmm4, xmm6              ; q0-=p0
        por         xmm5, xmm4              ; abs(p0 - q0)

        movdqa      xmm4, [GLOBAL(t80)]

        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
        paddusb     xmm5, xmm1              ; abs(p0-q0)*2 + abs(p1-q1)/2
        psubusb     xmm5, xmm7              ; saturates to 0 unless the sum exceeds blimit
        pxor        xmm7, xmm7
        pcmpeqb     xmm5, xmm7
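        ; xmm5 now holds 0xff in every byte lane whose flatness measure is
        ; <= blimit (psubusb saturated to zero there), i.e. the lanes that
        ; will actually be filtered, and 0x00 in the lanes left untouched.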


        ; start work on filters
        pxor        xmm2, xmm4              ; p1 offset to convert to signed values
        pxor        xmm3, xmm4              ; q1 offset to convert to signed values
        psubsb      xmm2, xmm3              ; p1 - q1

        pxor        xmm6, xmm4              ; p0 offset to convert to signed values
        pxor        xmm0, xmm4              ; q0 offset to convert to signed values
        movdqa      xmm3, xmm0              ; q0
        psubsb      xmm0, xmm6              ; q0 - p0
        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
        paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
        pand        xmm5, xmm2              ; mask filter values we don't care about

        movdqa      xmm0, xmm5
        paddsb      xmm5,        [GLOBAL(t3)]                  ; 3 * (q0 - p0) + (p1 - q1) + 3 (Filter2)
        paddsb      xmm0,        [GLOBAL(t4)]                  ; 3 * (q0 - p0) + (p1 - q1) + 4 (Filter1)

        movdqa      xmm1, [GLOBAL(te0)]
        movdqa      xmm2, [GLOBAL(t1f)]

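        ; Note: SSE2 has no per-byte arithmetic right shift, so the >> 3 below
        ; is emulated: psrlw shifts each 16-bit word right by 3, the t1f mask
        ; (0x1f) clears the three bits that leaked across each byte boundary,
        ; and the saved sign bits are OR-ed back in via te0 (0xe0). Per byte,
        ; roughly:  ((x >> 3) & 0x1f) | (x < 0 ? 0xe0 : 0)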
;       pxor        xmm7, xmm7              ; not needed: xmm7 is still zero from the mask compare above
        pcmpgtb     xmm7, xmm0              ; save sign of Filter1
        pand        xmm7, xmm1              ; preserve the upper 3 bits
        psrlw       xmm0, 3
        pand        xmm0, xmm2              ; clear out upper 3 bits
        por         xmm0, xmm7              ; add sign
        psubsb      xmm3, xmm0              ; q0 -= Filter1

        pxor        xmm7, xmm7
        pcmpgtb     xmm7, xmm5              ; save sign of Filter2
        pand        xmm7, xmm1              ; preserve the upper 3 bits
        psrlw       xmm5, 3
        pand        xmm5, xmm2              ; clear out upper 3 bits
        por         xmm5, xmm7              ; add sign
        paddsb      xmm6, xmm5              ; p0 += Filter2

        pxor        xmm3, xmm4              ; unoffset
        movdqa      [rcx], xmm3             ; write back q0

        pxor        xmm6, xmm4              ; unoffset
        movdqa      [rcx+rax], xmm6         ; write back p0

    ; begin epilog
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_loop_filter_simple_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *blimit
;)
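; Vertical variant of the simple filter: 16 rows of the 4 pixels straddling
; the edge (p1 p0 q0 q1) are gathered with movd loads, transposed into one
; register per column, filtered exactly as in the horizontal case above,
; then transposed back and scattered out one row at a time.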
global sym(vp8_loop_filter_simple_vertical_edge_sse2) PRIVATE
sym(vp8_loop_filter_simple_vertical_edge_sse2):
    push        rbp         ; save old base pointer value.
    mov         rbp, rsp    ; set new base pointer value.
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 7
    GET_GOT     rbx         ; save callee-saved reg
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                         ; reserve 32 bytes
    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[16];

        mov         rsi, arg(0)                 ;src_ptr
        movsxd      rax, dword ptr arg(1)       ;src_pixel_step

        lea         rsi,        [rsi - 2]       ; start two columns left of the edge, at p1
        lea         rdi,        [rsi + rax]
        lea         rdx,        [rsi + rax*4]
        lea         rcx,        [rdx + rax]

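        ; Gather the first 8 rows: each movd loads one row's 4 pixels
        ; (p1 p0 q0 q1); punpckldq pairs rows 4 apart, and the byte/word
        ; interleaves below progressively turn rows into columns so that
        ; the punpck(l/h)qdq at the end yields one 16-pixel register per
        ; column.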
        movd        xmm0,       [rsi]                   ; (high 96 bits unused) 03 02 01 00
        movd        xmm1,       [rdx]                   ; (high 96 bits unused) 43 42 41 40
        movd        xmm2,       [rdi]                   ; 13 12 11 10
        movd        xmm3,       [rcx]                   ; 53 52 51 50
        punpckldq   xmm0,       xmm1                    ; (high 64 bits unused) 43 42 41 40 03 02 01 00
        punpckldq   xmm2,       xmm3                    ; 53 52 51 50 13 12 11 10

        movd        xmm4,       [rsi + rax*2]           ; 23 22 21 20
        movd        xmm5,       [rdx + rax*2]           ; 63 62 61 60
        movd        xmm6,       [rdi + rax*2]           ; 33 32 31 30
        movd        xmm7,       [rcx + rax*2]           ; 73 72 71 70
        punpckldq   xmm4,       xmm5                    ; 63 62 61 60 23 22 21 20
        punpckldq   xmm6,       xmm7                    ; 73 72 71 70 33 32 31 30

        punpcklbw   xmm0,       xmm2                    ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
        punpcklbw   xmm4,       xmm6                    ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20

        movdqa      xmm1,       xmm0
        punpcklwd   xmm0,       xmm4                    ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm1,       xmm4                    ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

        movdqa      xmm2,       xmm0
        punpckldq   xmm0,       xmm1                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
        punpckhdq   xmm2,       xmm1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        lea         rsi,        [rsi + rax*8]
        lea         rdi,        [rsi + rax]
        lea         rdx,        [rsi + rax*4]
        lea         rcx,        [rdx + rax]

        movd        xmm4,       [rsi]                   ; 83 82 81 80
        movd        xmm1,       [rdx]                   ; c3 c2 c1 c0
        movd        xmm6,       [rdi]                   ; 93 92 91 90
        movd        xmm3,       [rcx]                   ; d3 d2 d1 d0
        punpckldq   xmm4,       xmm1                    ; c3 c2 c1 c0 83 82 81 80
        punpckldq   xmm6,       xmm3                    ; d3 d2 d1 d0 93 92 91 90

        movd        xmm1,       [rsi + rax*2]           ; a3 a2 a1 a0
        movd        xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
        movd        xmm3,       [rdi + rax*2]           ; b3 b2 b1 b0
        movd        xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
        punpckldq   xmm1,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
        punpckldq   xmm3,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0

        punpcklbw   xmm4,       xmm6                    ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
        punpcklbw   xmm1,       xmm3                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0

        movdqa      xmm7,       xmm4
        punpcklwd   xmm4,       xmm1                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
        punpckhwd   xmm7,       xmm1                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0

        movdqa      xmm6,       xmm4
        punpckldq   xmm4,       xmm7                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
        punpckhdq   xmm6,       xmm7                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82

        movdqa      xmm1,       xmm0
        movdqa      xmm3,       xmm2

        punpcklqdq  xmm0,       xmm4                    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        punpckhqdq  xmm1,       xmm4                    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        punpcklqdq  xmm2,       xmm6                    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3,       xmm6                    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        mov         rdx,        arg(2)                          ;blimit

        ; calculate mask
        movdqa      xmm6,       xmm0                            ; p1
        movdqa      xmm7,       xmm3                            ; q1
        psubusb     xmm7,       xmm0                            ; q1-=p1
        psubusb     xmm6,       xmm3                            ; p1-=q1
        por         xmm6,       xmm7                            ; abs(p1-q1)
        pand        xmm6,       [GLOBAL(tfe)]                   ; set lsb of each byte to zero
        psrlw       xmm6,       1                               ; abs(p1-q1)/2

        movdqa      xmm7, [rdx]

        movdqa      xmm5,       xmm1                            ; p0
        movdqa      xmm4,       xmm2                            ; q0
        psubusb     xmm5,       xmm2                            ; p0-=q0
        psubusb     xmm4,       xmm1                            ; q0-=p0
        por         xmm5,       xmm4                            ; abs(p0 - q0)
        paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
        paddusb     xmm5,       xmm6                            ; abs(p0-q0)*2 + abs(p1-q1)/2

        movdqa      xmm4, [GLOBAL(t80)]

        psubusb     xmm5,        xmm7                           ; saturates to 0 unless the sum exceeds blimit
        pxor        xmm7,        xmm7
        pcmpeqb     xmm5,        xmm7                           ; xmm5 = mask

        ; start work on filters
        movdqa        t0,        xmm0                           ; save p1
        movdqa        t1,        xmm3                           ; save q1

        pxor        xmm0,        xmm4                           ; p1 offset to convert to signed values
        pxor        xmm3,        xmm4                           ; q1 offset to convert to signed values
        psubsb      xmm0,        xmm3                           ; p1 - q1

        pxor        xmm1,        xmm4                           ; p0 offset to convert to signed values
        pxor        xmm2,        xmm4                           ; q0 offset to convert to signed values

        movdqa      xmm3,        xmm2                           ; offset q0
        psubsb      xmm2,        xmm1                           ; q0 - p0
        paddsb      xmm0,        xmm2                           ; p1 - q1 + 1 * (q0 - p0)
        paddsb      xmm0,        xmm2                           ; p1 - q1 + 2 * (q0 - p0)
        paddsb      xmm0,        xmm2                           ; p1 - q1 + 3 * (q0 - p0)
        pand        xmm5,        xmm0                           ; mask filter values we don't care about

        movdqa      xmm0, xmm5
        paddsb      xmm5,        [GLOBAL(t3)]                   ; 3 * (q0 - p0) + (p1 - q1) + 3 (Filter2)
        paddsb      xmm0,        [GLOBAL(t4)]                   ; 3 * (q0 - p0) + (p1 - q1) + 4 (Filter1)

        movdqa  xmm6, [GLOBAL(te0)]
        movdqa  xmm2, [GLOBAL(t1f)]

        ; per-byte arithmetic >> 3, emulated as in the horizontal simple filter above
;       pxor        xmm7, xmm7              ; not needed: xmm7 is still zero from the mask compare above
        pcmpgtb     xmm7, xmm0              ; save sign of Filter1
        pand        xmm7, xmm6              ; preserve the upper 3 bits
        psrlw       xmm0, 3
        pand        xmm0, xmm2              ; clear out upper 3 bits
        por         xmm0, xmm7              ; add sign
        psubsb      xmm3, xmm0              ; q0 -= Filter1

        pxor        xmm7, xmm7
        pcmpgtb     xmm7, xmm5              ; save sign of Filter2
        pand        xmm7, xmm6              ; preserve the upper 3 bits
        psrlw       xmm5, 3
        pand        xmm5, xmm2              ; clear out upper 3 bits
        por         xmm5, xmm7              ; add sign
        paddsb      xmm1, xmm5              ; p0 += Filter2

        pxor        xmm3,        xmm4                           ; unoffset q0
        pxor        xmm1,        xmm4                           ; unoffset p0

        movdqa      xmm0,        t0                             ; p1
        movdqa      xmm4,        t1                             ; q1

        ; columns to write out: p1 = xmm0, p0 = xmm1, q0 = xmm3, q1 = xmm4
        lea         rdx,        [rsi + rax*4]

        ; transpose back to write out
        ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        movdqa      xmm6,       xmm0
        punpcklbw   xmm0,       xmm1                               ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm6,       xmm1                               ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm5,       xmm3
        punpcklbw   xmm3,       xmm4                               ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm5,       xmm4                               ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        movdqa      xmm2,       xmm0
        punpcklwd   xmm0,       xmm3                               ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
        punpckhwd   xmm2,       xmm3                               ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40

        movdqa      xmm3,       xmm6
        punpcklwd   xmm6,       xmm5                               ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm3,       xmm5                               ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

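        ; Scatter the filtered block back: after this transpose each dword of
        ; xmm0/xmm2 (rows 0-7) and xmm6/xmm3 (rows 8-15) holds one row's
        ; 4 bytes (p1 p0 q0 q1), so each row is stored with a movd and the
        ; register advanced to the next row with psrldq 4.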
        movd        [rsi],      xmm6                               ; write the second 8-line result
        movd        [rdx],      xmm3
        psrldq      xmm6,       4
        psrldq      xmm3,       4
        movd        [rdi],      xmm6
        movd        [rcx],      xmm3
        psrldq      xmm6,       4
        psrldq      xmm3,       4
        movd        [rsi + rax*2], xmm6
        movd        [rdx + rax*2], xmm3
        psrldq      xmm6,       4
        psrldq      xmm3,       4
        movd        [rdi + rax*2], xmm6
        movd        [rcx + rax*2], xmm3

        neg         rax
        lea         rsi,        [rsi + rax*8]                       ; step back up 8 rows
        neg         rax
        lea         rdi,        [rsi + rax]
        lea         rdx,        [rsi + rax*4]
        lea         rcx,        [rdx + rax]

        movd        [rsi],      xmm0                                ; write the first 8-line result
        movd        [rdx],      xmm2
        psrldq      xmm0,       4
        psrldq      xmm2,       4
        movd        [rdi],      xmm0
        movd        [rcx],      xmm2
        psrldq      xmm0,       4
        psrldq      xmm2,       4
        movd        [rsi + rax*2], xmm0
        movd        [rdx + rax*2], xmm2
        psrldq      xmm0,       4
        psrldq      xmm2,       4
        movd        [rdi + rax*2], xmm0
        movd        [rcx + rax*2], xmm2

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

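; Constant pool for the filters in this file: tfe masks off the low bit
; before the abs(p1-q1)/2 shift, t80 is the signed/unsigned bias, t3/t4 are
; the filter rounding terms, and te0/t1f implement the emulated per-byte
; arithmetic shift. The word constants (ones, s9, s63) are used by the
; macroblock-filter tap arithmetic earlier in the file.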
SECTION_RODATA
align 16
tfe:
    times 16 db 0xfe
align 16
t80:
    times 16 db 0x80
align 16
t1s:
    times 16 db 0x01
align 16
t3:
    times 16 db 0x03
align 16
t4:
    times 16 db 0x04
align 16
ones:
    times 8 dw 0x0001
align 16
s9:
    times 8 dw 0x0900
align 16
s63:
    times 8 dw 0x003f
align 16
te0:
    times 16 db 0xe0
align 16
t1f:
    times 16 db 0x1f