;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Use of pmaxub instead of psubusb to compute filter mask was seen
; in ffvp8
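;
; A sketch of that trick (comments only, nothing here is assembled;
; t and u stand for any two scratch xmm registers):
;         movdqa   t, a
;         psubusb  t, b             ; max(a - b, 0), unsigned saturate
;         movdqa   u, b
;         psubusb  u, a             ; max(b - a, 0)
;         por      t, u             ; |a - b|
; Rather than testing each |x - y| <= limit with its own saturating
; subtract, the absolute differences are folded into a running maximum
; with pmaxub, leaving a single psubusb against the limit at the end.
;
; LFH_FILTER_AND_HEV_MASK computes the filter mask (xmm1, 0xff where the
; pixel gets filtered) and the high-edge-variance mask (xmm4) for a
; horizontal edge.  xmm7 holds the limit on entry.  %1 == 1 reads whole
; rows around rsi/rdi (luma); %1 == 0 gathers 8-pixel rows from the
; u (rsi) and v (rdi) planes and spills q2/q1/p2/p1 to the stack.
;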
%macro LFH_FILTER_AND_HEV_MASK 1
%if %1
        movdqa      xmm2,                   [rdi+2*rax]       ; q3
        movdqa      xmm1,                   [rsi+2*rax]       ; q2
        movdqa      xmm4,                   [rsi+rax]         ; q1
        movdqa      xmm5,                   [rsi]             ; q0
        neg         rax                     ; negate pitch to deal with above border
%else
        movlps      xmm2,                   [rsi + rcx*2]     ; q3
        movlps      xmm1,                   [rsi + rcx]       ; q2
        movlps      xmm4,                   [rsi]             ; q1
        movlps      xmm5,                   [rsi + rax]       ; q0

        movhps      xmm2,                   [rdi + rcx*2]
        movhps      xmm1,                   [rdi + rcx]
        movhps      xmm4,                   [rdi]
        movhps      xmm5,                   [rdi + rax]

        lea         rsi,                    [rsi + rax*4]
        lea         rdi,                    [rdi + rax*4]

        movdqa      XMMWORD PTR [rsp],      xmm1              ; store q2
        movdqa      XMMWORD PTR [rsp + 16], xmm4              ; store q1
%endif

        movdqa      xmm6,                   xmm1              ; q2
        movdqa      xmm3,                   xmm4              ; q1

        psubusb     xmm1,                   xmm2              ; q2-=q3
        psubusb     xmm2,                   xmm6              ; q3-=q2

        psubusb     xmm4,                   xmm6              ; q1-=q2
        psubusb     xmm6,                   xmm3              ; q2-=q1

        por         xmm4,                   xmm6              ; abs(q2-q1)
        por         xmm1,                   xmm2              ; abs(q3-q2)

        movdqa      xmm0,                   xmm5              ; q0
        pmaxub      xmm1,                   xmm4

        psubusb     xmm5,                   xmm3              ; q0-=q1
        psubusb     xmm3,                   xmm0              ; q1-=q0

        por         xmm5,                   xmm3              ; abs(q0-q1)
        movdqa      t0,                     xmm5              ; save to t0

        pmaxub      xmm1,                   xmm5

%if %1
        movdqa      xmm2,                   [rsi+4*rax]       ; p3
        movdqa      xmm4,                   [rdi+4*rax]       ; p2
        movdqa      xmm6,                   [rsi+2*rax]       ; p1
%else
        movlps      xmm2,                   [rsi + rax]       ; p3
        movlps      xmm4,                   [rsi]             ; p2
        movlps      xmm6,                   [rsi + rcx]       ; p1

        movhps      xmm2,                   [rdi + rax]
        movhps      xmm4,                   [rdi]
        movhps      xmm6,                   [rdi + rcx]

        movdqa      XMMWORD PTR [rsp + 32], xmm4              ; store p2
        movdqa      XMMWORD PTR [rsp + 48], xmm6              ; store p1
%endif

        movdqa      xmm5,                   xmm4              ; p2
        movdqa      xmm3,                   xmm6              ; p1

        psubusb     xmm4,                   xmm2              ; p2-=p3
        psubusb     xmm2,                   xmm5              ; p3-=p2

        psubusb     xmm3,                   xmm5              ; p1-=p2
        pmaxub      xmm1,                   xmm4              ; abs(p3 - p2)

        psubusb     xmm5,                   xmm6              ; p2-=p1
        pmaxub      xmm1,                   xmm2              ; abs(p3 - p2)

        pmaxub      xmm1,                   xmm5              ; abs(p2 - p1)
        movdqa      xmm2,                   xmm6              ; p1

        pmaxub      xmm1,                   xmm3              ; abs(p2 - p1)
%if %1
        movdqa      xmm4,                   [rsi+rax]         ; p0
        movdqa      xmm3,                   [rdi]             ; q1
%else
        movlps      xmm4,                   [rsi + rcx*2]     ; p0
        movhps      xmm4,                   [rdi + rcx*2]
        movdqa      xmm3,                   q1                ; q1
%endif

        movdqa      xmm5,                   xmm4              ; p0
        psubusb     xmm4,                   xmm6              ; p0-=p1

        psubusb     xmm6,                   xmm5              ; p1-=p0

        por         xmm6,                   xmm4              ; abs(p1 - p0)
        mov         rdx,                    arg(2)            ; get flimit

        movdqa      t1,                     xmm6              ; save to t1

        movdqa      xmm4,                   xmm3              ; q1
        pmaxub      xmm1,                   xmm6

        psubusb     xmm3,                   xmm2              ; q1-=p1
        psubusb     xmm2,                   xmm4              ; p1-=q1

        psubusb     xmm1,                   xmm7              ; accumulated max - limit: nonzero where limit exceeded
        por         xmm2,                   xmm3              ; abs(p1-q1)

        movdqa      xmm4,                   XMMWORD PTR [rdx] ; flimit

        movdqa      xmm3,                   xmm0              ; q0
        pand        xmm2,                   [GLOBAL(tfe)]     ; set lsb of each byte to zero

        mov         rdx,                    arg(4)            ; get hev thresh

        movdqa      xmm6,                   xmm5              ; p0
        psrlw       xmm2,                   1                 ; abs(p1-q1)/2

        psubusb     xmm5,                   xmm3              ; p0-=q0
        paddb       xmm4,                   xmm4              ; flimit*2 (less than 255)

        psubusb     xmm3,                   xmm6              ; q0-=p0
        por         xmm5,                   xmm3              ; abs(p0 - q0)

        paddusb     xmm5,                   xmm5              ; abs(p0-q0)*2
        paddb       xmm7,                   xmm4              ; flimit * 2 + limit (less than 255)

        movdqa      xmm4,                   t0                ; get abs(q1 - q0) for hev

        movdqa      xmm3,                   t1                ; get abs(p1 - p0)

        paddusb     xmm5,                   xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        movdqa      xmm2,                   XMMWORD PTR [rdx] ; thresh

        psubusb     xmm5,                   xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
        psubusb     xmm4,                   xmm2              ; hev

        psubusb     xmm3,                   xmm2              ; hev
        por         xmm1,                   xmm5              ; fold blimit test into the mask

        pxor        xmm7,                   xmm7
        paddb       xmm4,                   xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        pcmpeqb     xmm4,                   xmm5              ; hev (xmm5 is zero wherever the mask can still pass, so nonzero bytes are don't-care)
        pcmpeqb     xmm3,                   xmm3              ; hev

        pcmpeqb     xmm1,                   xmm7              ; mask xmm1
        pxor        xmm4,                   xmm3              ; hev
%endmacro
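;
; B_FILTER applies the normal 4-tap loop filter, in outline (using the
; names from the C reference code):
;     F       = clamp(clamp(p1 - q1) & hev + 3 * (q0 - p0)) & mask
;     Filter1 = clamp(F + 4) >> 3;        q0 -= Filter1
;     Filter2 = clamp(F + 3) >> 3;        p0 += Filter2
;     u       = (Filter1 + 1) >> 1;       p1 += u, q1 -= u  where !hev
; Inputs: xmm1 = filter mask, xmm4 = hev mask.  %1 == 0 takes p1/q1 from
; the stack and writes uv rows back with movq/movhps; %1 == 1 works on
; rows around rsi/rdi; %1 == 2 works in the srct transpose buffer.
;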
%macro B_FILTER 1
%if %1 == 0
        movdqa      xmm2,                   p1                ; p1
        movdqa      xmm7,                   q1                ; q1
%elif %1 == 1
        movdqa      xmm2,                   [rsi+2*rax]       ; p1
        movdqa      xmm7,                   [rdi]             ; q1
%elif %1 == 2
        lea         rdx,                    srct

        movdqa      xmm2,                   [rdx]             ; p1
        movdqa      xmm7,                   [rdx+48]          ; q1
        movdqa      xmm6,                   [rdx+16]          ; p0
        movdqa      xmm0,                   [rdx+32]          ; q0
%endif

        pxor        xmm2,                   [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7,                   [GLOBAL(t80)]     ; q1 offset to convert to signed values

        psubsb      xmm2,                   xmm7              ; p1 - q1
        pxor        xmm6,                   [GLOBAL(t80)]     ; offset to convert to signed values

        pand        xmm2,                   xmm4              ; high var mask (hvm)(p1 - q1)
        pxor        xmm0,                   [GLOBAL(t80)]     ; offset to convert to signed values

        movdqa      xmm3,                   xmm0              ; q0
        psubsb      xmm0,                   xmm6              ; q0 - p0

        paddsb      xmm2,                   xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2,                   xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2,                   xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)

        pand        xmm1,                   xmm2              ; mask filter values we don't care about

        movdqa      xmm2,                   xmm1

        paddsb      xmm1,                   [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      xmm2,                   [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        punpckhbw   xmm5,                   xmm2              ; axbxcxdx
        punpcklbw   xmm2,                   xmm2              ; exfxgxhx

        punpcklbw   xmm0,                   xmm1              ; exfxgxhx
        psraw       xmm5,                   11                ; sign extended shift right by 3

        punpckhbw   xmm1,                   xmm1              ; axbxcxdx
        psraw       xmm2,                   11                ; sign extended shift right by 3

        packsswb    xmm2,                   xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
        psraw       xmm0,                   11                ; sign extended shift right by 3

        psraw       xmm1,                   11                ; sign extended shift right by 3
        movdqa      xmm5,                   xmm0              ; save results

        packsswb    xmm0,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      xmm5,                   [GLOBAL(ones)]

        paddsw      xmm1,                   [GLOBAL(ones)]
        psraw       xmm5,                   1                 ; partial shifted one more time for 2nd tap

        psraw       xmm1,                   1                 ; partial shifted one more time for 2nd tap

        paddsb      xmm6,                   xmm2              ; p0+= p0 add
        packsswb    xmm5,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4

%if %1 == 0
        movdqa      xmm1,                   p1                ; p1
%elif %1 == 1
        movdqa      xmm1,                   [rsi+2*rax]       ; p1
%elif %1 == 2
        movdqa      xmm1,                   [rdx]             ; p1
%endif
        pandn       xmm4,                   xmm5              ; high edge variance additive
        pxor        xmm6,                   [GLOBAL(t80)]     ; unoffset

        pxor        xmm1,                   [GLOBAL(t80)]     ; reoffset
        psubsb      xmm3,                   xmm0              ; q0-= q0 add

        paddsb      xmm1,                   xmm4              ; p1+= p1 add
        pxor        xmm3,                   [GLOBAL(t80)]     ; unoffset

        pxor        xmm1,                   [GLOBAL(t80)]     ; unoffset
        psubsb      xmm7,                   xmm4              ; q1-= q1 add

        pxor        xmm7,                   [GLOBAL(t80)]     ; unoffset
%if %1 == 0
        lea         rsi,                    [rsi + rcx*2]
        lea         rdi,                    [rdi + rcx*2]
        movq        MMWORD PTR [rsi],       xmm6              ; p0
        movhps      MMWORD PTR [rdi],       xmm6
        movq        MMWORD PTR [rsi + rax], xmm1              ; p1
        movhps      MMWORD PTR [rdi + rax], xmm1
        movq        MMWORD PTR [rsi + rcx], xmm3              ; q0
        movhps      MMWORD PTR [rdi + rcx], xmm3
        movq        MMWORD PTR [rsi + rcx*2], xmm7            ; q1
        movhps      MMWORD PTR [rdi + rcx*2], xmm7
%elif %1 == 1
        movdqa      [rsi+rax],              xmm6              ; write back
        movdqa      [rsi+2*rax],            xmm1              ; write back
        movdqa      [rsi],                  xmm3              ; write back
        movdqa      [rdi],                  xmm7              ; write back
%endif

%endmacro


;void vp8_loop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_horizontal_edge_sse2)
sym(vp8_loop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

        mov         rsi,                    arg(0)           ;src_ptr
        movsxd      rax,                    dword ptr arg(1) ;src_pixel_step

        mov         rdx,                    arg(3)           ;limit
        movdqa      xmm7,                   XMMWORD PTR [rdx]

        lea         rdi,                    [rsi+rax]        ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the result
        B_FILTER 1

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_loop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_horizontal_edge_uv_sse2)
sym(vp8_loop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96       ; reserve 96 bytes
    %define q2  [rsp + 0]     ;__declspec(align(16)) char q2[16];
    %define q1  [rsp + 16]    ;__declspec(align(16)) char q1[16];
    %define p2  [rsp + 32]    ;__declspec(align(16)) char p2[16];
    %define p1  [rsp + 48]    ;__declspec(align(16)) char p1[16];
    %define t0  [rsp + 64]    ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 80]    ;__declspec(align(16)) char t1[16];

        mov         rsi,                    arg(0)             ; u
        mov         rdi,                    arg(5)             ; v
        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
        mov         rcx,                    rax
        neg         rax                     ; negate pitch to deal with above border

        mov         rdx,                    arg(3)             ;limit
        movdqa      xmm7,                   XMMWORD PTR [rdx]

        lea         rsi,                    [rsi + rcx]
        lea         rdi,                    [rdi + rcx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the result
        B_FILTER 0

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

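;
; MB_FILTER_AND_WRITEBACK applies the macroblock filter across p2..q2,
; in outline (names from the C reference code):
;     F       = clamp(clamp(p1 - q1) + 3 * (q0 - p0)) & mask
;     Filter2 = F & hev
;     Filter1 = clamp(Filter2 + 4) >> 3;   q0 -= Filter1
;     Filter2 = clamp(Filter2 + 3) >> 3;   p0 += Filter2
; then, with F &= ~hev (pmulhw against s9 does the 9x scaling on words
; holding the filter value in their high byte):
;     u1 = clamp((63 + F *  9) >> 7);      p2 += u1, q2 -= u1
;     u2 = clamp((63 + F * 18) >> 7);      p1 += u2, q1 -= u2
;     u3 = clamp((63 + F * 27) >> 7);      p0 += u3, q0 -= u3
; %1 selects the same three input/output layouts as B_FILTER.
;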
%macro MB_FILTER_AND_WRITEBACK 1
%if %1 == 0
        movdqa      xmm2,                   p1              ; p1
        movdqa      xmm7,                   q1              ; q1
%elif %1 == 1
        movdqa      xmm2,                   [rsi+2*rax]     ; p1
        movdqa      xmm7,                   [rdi]           ; q1

        mov         rcx,                    rax
        neg         rcx
%elif %1 == 2
        lea         rdx,                    srct

        movdqa      xmm2,                   [rdx+32]        ; p1
        movdqa      xmm7,                   [rdx+80]        ; q1
        movdqa      xmm6,                   [rdx+48]        ; p0
        movdqa      xmm0,                   [rdx+64]        ; q0
%endif

        pxor        xmm2,                   [GLOBAL(t80)]   ; p1 offset to convert to signed values
        pxor        xmm7,                   [GLOBAL(t80)]   ; q1 offset to convert to signed values
        pxor        xmm6,                   [GLOBAL(t80)]   ; offset to convert to signed values
        pxor        xmm0,                   [GLOBAL(t80)]   ; offset to convert to signed values

        psubsb      xmm2,                   xmm7            ; p1 - q1
        movdqa      xmm3,                   xmm0            ; q0

        psubsb      xmm0,                   xmm6            ; q0 - p0

        paddsb      xmm2,                   xmm0            ; 1 * (q0 - p0) + (p1 - q1)
        paddsb      xmm2,                   xmm0            ; 2 * (q0 - p0) + (p1 - q1)
        paddsb      xmm2,                   xmm0            ; 3 * (q0 - p0) + (p1 - q1)

        pand        xmm1,                   xmm2            ; mask filter values we don't care about

        movdqa      xmm2,                   xmm1            ; vp8_filter

        pand        xmm2,                   xmm4            ; Filter2 = vp8_filter & hev
        pxor        xmm0,                   xmm0

        pandn       xmm4,                   xmm1            ; vp8_filter&=~hev
        pxor        xmm1,                   xmm1

        punpcklbw   xmm0,                   xmm4            ; Filter 2 (hi)
        movdqa      xmm5,                   xmm2

        punpckhbw   xmm1,                   xmm4            ; Filter 2 (lo)
        paddsb      xmm5,                   [GLOBAL(t3)]    ; vp8_signed_char_clamp(Filter2 + 3)

        pmulhw      xmm1,                   [GLOBAL(s9)]    ; Filter 2 (lo) * 9

        pmulhw      xmm0,                   [GLOBAL(s9)]    ; Filter 2 (hi) * 9

        punpckhbw   xmm7,                   xmm5            ; axbxcxdx
        paddsb      xmm2,                   [GLOBAL(t4)]    ; vp8_signed_char_clamp(Filter2 + 4)

        punpcklbw   xmm5,                   xmm5            ; exfxgxhx
        psraw       xmm7,                   11              ; sign extended shift right by 3

        psraw       xmm5,                   11              ; sign extended shift right by 3
        punpckhbw   xmm4,                   xmm2            ; axbxcxdx

        punpcklbw   xmm2,                   xmm2            ; exfxgxhx
        psraw       xmm4,                   11              ; sign extended shift right by 3

        packsswb    xmm5,                   xmm7            ; Filter2 >>=3;
        psraw       xmm2,                   11              ; sign extended shift right by 3

        packsswb    xmm2,                   xmm4            ; Filter1 >>=3;
        movdqa      xmm7,                   xmm1
        paddsb      xmm6,                   xmm5            ; ps0 = ps0 + Filter2
        movdqa      xmm4,                   xmm1

        psubsb      xmm3,                   xmm2            ; qs0 = qs0 - Filter1
        movdqa      xmm5,                   xmm0

        movdqa      xmm2,                   xmm5
        paddw       xmm0,                   [GLOBAL(s63)]   ; Filter 2 (hi) * 9 + 63

        paddw       xmm1,                   [GLOBAL(s63)]   ; Filter 2 (lo) * 9 + 63
        paddw       xmm5,                   xmm5            ; Filter 2 (hi) * 18

        paddw       xmm7,                   xmm7            ; Filter 2 (lo) * 18
        paddw       xmm5,                   xmm0            ; Filter 2 (hi) * 27 + 63

        paddw       xmm7,                   xmm1            ; Filter 2 (lo) * 27 + 63
        paddw       xmm2,                   xmm0            ; Filter 2 (hi) * 18 + 63

        paddw       xmm4,                   xmm1            ; Filter 2 (lo) * 18 + 63
        psraw       xmm0,                   7               ; (Filter 2 (hi) * 9 + 63) >> 7

        psraw       xmm1,                   7               ; (Filter 2 (lo) * 9 + 63) >> 7
        psraw       xmm2,                   7               ; (Filter 2 (hi) * 18 + 63) >> 7

        packsswb    xmm0,                   xmm1            ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
        psraw       xmm4,                   7               ; (Filter 2 (lo) * 18 + 63) >> 7

        psraw       xmm5,                   7               ; (Filter 2 (hi) * 27 + 63) >> 7
        packsswb    xmm2,                   xmm4            ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)

        psraw       xmm7,                   7               ; (Filter 2 (lo) * 27 + 63) >> 7

        packsswb    xmm5,                   xmm7            ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)

        psubsb      xmm3,                   xmm5            ; sq = vp8_signed_char_clamp(qs0 - u3)
        paddsb      xmm6,                   xmm5            ; sp = vp8_signed_char_clamp(ps0 + u3)

%if %1 == 0
        movdqa      xmm5,                   q2              ; q2
        movdqa      xmm1,                   q1              ; q1
        movdqa      xmm4,                   p1              ; p1
        movdqa      xmm7,                   p2              ; p2

%elif %1 == 1
        movdqa      xmm5,                   XMMWORD PTR [rdi+rcx]   ; q2
        movdqa      xmm1,                   XMMWORD PTR [rdi]       ; q1
        movdqa      xmm4,                   XMMWORD PTR [rsi+rax*2] ; p1
        movdqa      xmm7,                   XMMWORD PTR [rdi+rax*4] ; p2
%elif %1 == 2
        movdqa      xmm5,                   XMMWORD PTR [rdx+96]    ; q2
        movdqa      xmm1,                   XMMWORD PTR [rdx+80]    ; q1
        movdqa      xmm4,                   XMMWORD PTR [rdx+32]    ; p1
        movdqa      xmm7,                   XMMWORD PTR [rdx+16]    ; p2
%endif

        pxor        xmm3,                   [GLOBAL(t80)]   ; *oq0 = sq^0x80
        pxor        xmm6,                   [GLOBAL(t80)]   ; *op0 = sp^0x80

        pxor        xmm1,                   [GLOBAL(t80)]
        pxor        xmm4,                   [GLOBAL(t80)]

        psubsb      xmm1,                   xmm2            ; sq = vp8_signed_char_clamp(qs1 - u2)
        paddsb      xmm4,                   xmm2            ; sp = vp8_signed_char_clamp(ps1 + u2)

        pxor        xmm1,                   [GLOBAL(t80)]   ; *oq1 = sq^0x80;
        pxor        xmm4,                   [GLOBAL(t80)]   ; *op1 = sp^0x80;

        pxor        xmm7,                   [GLOBAL(t80)]
        pxor        xmm5,                   [GLOBAL(t80)]

        paddsb      xmm7,                   xmm0            ; sp = vp8_signed_char_clamp(ps2 + u1)
        psubsb      xmm5,                   xmm0            ; sq = vp8_signed_char_clamp(qs2 - u1)

        pxor        xmm7,                   [GLOBAL(t80)]   ; *op2 = sp^0x80;
        pxor        xmm5,                   [GLOBAL(t80)]   ; *oq2 = sq^0x80;

%if %1 == 0
        lea         rsi,                    [rsi+rcx*2]
        lea         rdi,                    [rdi+rcx*2]

        movq        MMWORD PTR [rsi],       xmm6            ; p0
        movhps      MMWORD PTR [rdi],       xmm6
        movq        MMWORD PTR [rsi + rcx], xmm3            ; q0
        movhps      MMWORD PTR [rdi + rcx], xmm3

        movq        MMWORD PTR [rsi+rcx*2], xmm1            ; q1
        movhps      MMWORD PTR [rdi+rcx*2], xmm1

        movq        MMWORD PTR [rsi + rax], xmm4            ; p1
        movhps      MMWORD PTR [rdi + rax], xmm4

        movq        MMWORD PTR [rsi+rax*2], xmm7            ; p2
        movhps      MMWORD PTR [rdi+rax*2], xmm7

        lea         rsi,                    [rsi + rcx]
        lea         rdi,                    [rdi + rcx]
        movq        MMWORD PTR [rsi+rcx*2], xmm5            ; q2
        movhps      MMWORD PTR [rdi+rcx*2], xmm5
%elif %1 == 1
        movdqa      XMMWORD PTR [rdi+rcx],  xmm5            ; q2
        movdqa      XMMWORD PTR [rdi],      xmm1            ; q1
        movdqa      XMMWORD PTR [rsi],      xmm3            ; q0
        movdqa      XMMWORD PTR [rsi+rax],  xmm6            ; p0
        movdqa      XMMWORD PTR [rsi+rax*2],xmm4            ; p1
        movdqa      XMMWORD PTR [rdi+rax*4],xmm7            ; p2
%elif %1 == 2
        movdqa      XMMWORD PTR [rdx+80],   xmm1            ; q1
        movdqa      XMMWORD PTR [rdx+64],   xmm3            ; q0
        movdqa      XMMWORD PTR [rdx+48],   xmm6            ; p0
        movdqa      XMMWORD PTR [rdx+32],   xmm4            ; p1
%endif

%endmacro


;void vp8_mbloop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_mbloop_filter_horizontal_edge_sse2)
sym(vp8_mbloop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

        mov         rsi,                    arg(0)            ;src_ptr
        movsxd      rax,                    dword ptr arg(1)  ;src_pixel_step

        mov         rdx,                    arg(3)            ;limit
        movdqa      xmm7,                   XMMWORD PTR [rdx]

        lea         rdi,                    [rsi+rax]         ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 1

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_mbloop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2)
sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96       ; reserve 96 bytes
    %define q2  [rsp + 0]     ;__declspec(align(16)) char q2[16];
    %define q1  [rsp + 16]    ;__declspec(align(16)) char q1[16];
    %define p2  [rsp + 32]    ;__declspec(align(16)) char p2[16];
    %define p1  [rsp + 48]    ;__declspec(align(16)) char p1[16];
    %define t0  [rsp + 64]    ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 80]    ;__declspec(align(16)) char t1[16];

        mov         rsi,                    arg(0)             ; u
        mov         rdi,                    arg(5)             ; v
        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
        mov         rcx,                    rax
        neg         rax                     ; negate pitch to deal with above border

        mov         rdx,                    arg(3)             ;limit
        movdqa      xmm7,                   XMMWORD PTR [rdx]

        lea         rsi,                    [rsi + rcx]
        lea         rdi,                    [rdi + rcx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 0

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

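;
; TRANSPOSE_16X8 transposes a 16x8 block (16 rows of 8 pixels around
; rsi/rdi) into 8 rows of 16 pixels with a cascade of punpckl/h byte,
; word, dword and qword merges.  %1 == 1: both 8-row halves come from
; one plane (rsi/rdi advance by 8 rows) and rdx is pointed at srct;
; %1 == 0: the second half is read from the v plane (arg(5)).
; %2 == 1: store rows 2..5 at [rdx]..[rdx+48] and leave rows 0/1/6/7 in
; xmm2/xmm1/xmm6/xmm7; %2 == 0: store all eight rows at [rdx]..[rdx+112].
;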
%macro TRANSPOSE_16X8 2
        movq        xmm4,               QWORD PTR [rsi]        ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
        movq        xmm1,               QWORD PTR [rdi]        ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
        movq        xmm0,               QWORD PTR [rsi+2*rax]  ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
        movq        xmm7,               QWORD PTR [rdi+2*rax]  ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
        movq        xmm5,               QWORD PTR [rsi+4*rax]  ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
        movq        xmm2,               QWORD PTR [rdi+4*rax]  ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50

        punpcklbw   xmm4,               xmm1            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00

        movq        xmm1,               QWORD PTR [rdi+2*rcx]  ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70

        movdqa      xmm3,               xmm4            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
        punpcklbw   xmm0,               xmm7            ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20

        movq        xmm7,               QWORD PTR [rsi+2*rcx]  ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60

        punpcklbw   xmm5,               xmm2            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
%if %1
        lea         rsi,                [rsi+rax*8]
%else
        mov         rsi,                arg(5)          ; v_ptr
%endif

        movdqa      xmm6,               xmm5            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
        punpcklbw   xmm7,               xmm1            ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60

        punpcklwd   xmm5,               xmm7            ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

        punpckhwd   xmm6,               xmm7            ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
%if %1
        lea         rdi,                [rdi+rax*8]
%else
        lea         rsi,                [rsi - 4]
%endif

        punpcklwd   xmm3,               xmm0            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
%if %1
        lea         rdx,                srct
%else
        lea         rdi,                [rsi + rax]     ; rdi points to row +1 for indirect addressing
%endif

        movdqa      xmm2,               xmm3            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm4,               xmm0            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04

        movdqa      xmm7,               xmm4            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
        punpckhdq   xmm3,               xmm5            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        punpckhdq   xmm7,               xmm6            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpckldq   xmm4,               xmm6            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04

        punpckldq   xmm2,               xmm5            ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00

        movdqa      t0,                 xmm2            ; save to free XMM2
        movq        xmm2,               QWORD PTR [rsi]       ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
        movq        xmm6,               QWORD PTR [rdi]       ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
        movq        xmm0,               QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
        movq        xmm5,               QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
        movq        xmm1,               QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0

        punpcklbw   xmm2,               xmm6            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        movq        xmm6,               QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0

        punpcklbw   xmm0,               xmm5                  ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0

        movq        xmm5,               QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
        punpcklbw   xmm1,               xmm6            ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0

        movq        xmm6,               QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0

        punpcklbw   xmm5,               xmm6            ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0

        movdqa      xmm6,               xmm1
        punpckhwd   xmm6,               xmm5            ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4

        punpcklwd   xmm1,               xmm5            ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
        movdqa      xmm5,               xmm2            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        punpcklwd   xmm5,               xmm0            ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80

        punpckhwd   xmm2,               xmm0            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        movdqa      xmm0,               xmm5
        punpckldq   xmm0,               xmm1            ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80

        punpckhdq   xmm5,               xmm1            ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
        movdqa      xmm1,               xmm2            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        punpckldq   xmm1,               xmm6            ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84

        punpckhdq   xmm2,               xmm6            ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
        movdqa      xmm6,               xmm7            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpcklqdq  xmm6,               xmm2            ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06

        punpckhqdq  xmm7,               xmm2            ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
%if %2
        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        movdqa      [rdx],              xmm2            ; save 2

        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+16],           xmm3            ; save 3

        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+32],           xmm4            ; save 4
        movdqa      [rdx+48],           xmm5            ; save 5
        movdqa      xmm1,               t0              ; get

        movdqa      xmm2,               xmm1
        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01

        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
%else
        movdqa      [rdx+112],          xmm7            ; save 7

        movdqa      [rdx+96],           xmm6            ; save 6

        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        movdqa      [rdx+32],           xmm2            ; save 2

        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+48],           xmm3            ; save 3

        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+64],           xmm4            ; save 4
        movdqa      [rdx+80],           xmm5            ; save 5
        movdqa      xmm1,               t0              ; get

        movdqa      xmm2,               xmm1
        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01

        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        movdqa      [rdx+16],           xmm1

        movdqa      [rdx],              xmm2
%endif
%endmacro

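;
; LFV_FILTER_MASK_HEV_MASK is the vertical-edge counterpart of
; LFH_FILTER_AND_HEV_MASK: it builds the filter mask (xmm1) and the hev
; mask (xmm4) from the transposed columns.  On entry, straight from
; TRANSPOSE_16X8, xmm2/xmm1/xmm3/xmm5/xmm6/xmm7 hold the p3/p2/p0/q1/
; q2/q3 columns; p1, q0 and q1 are reloaded from srct, whose layout
; differs between the callers (%1 == 1: p1 at [rdx], q0/q1 at
; [rdx+32]/[rdx+48]; %1 == 0: p1 at [rdx+32], q0/q1 at [rdx+64]/[rdx+80]).
;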
%macro LFV_FILTER_MASK_HEV_MASK 1
        movdqa      xmm0,               xmm6            ; q2
        psubusb     xmm0,               xmm7            ; q2-q3

        psubusb     xmm7,               xmm6            ; q3-q2
        movdqa      xmm4,               xmm5            ; q1

        por         xmm7,               xmm0            ; abs (q3-q2)
        psubusb     xmm4,               xmm6            ; q1-q2

        movdqa      xmm0,               xmm1
        psubusb     xmm6,               xmm5            ; q2-q1

        por         xmm6,               xmm4            ; abs (q2-q1)
        psubusb     xmm0,               xmm2            ; p2 - p3;

        psubusb     xmm2,               xmm1            ; p3 - p2;
        por         xmm0,               xmm2            ; abs(p2-p3)
%if %1
        movdqa      xmm2,               [rdx]           ; p1
%else
        movdqa      xmm2,               [rdx+32]        ; p1
%endif
        movdqa      xmm5,               xmm2            ; p1
        pmaxub      xmm0,               xmm7

        psubusb     xmm5,               xmm1            ; p1-p2
        psubusb     xmm1,               xmm2            ; p2-p1

        movdqa      xmm7,               xmm3            ; p0
        psubusb     xmm7,               xmm2            ; p0-p1

        por         xmm1,               xmm5            ; abs(p2-p1)
        pmaxub      xmm0,               xmm6

        pmaxub      xmm0,               xmm1
        movdqa      xmm1,               xmm2            ; p1

        psubusb     xmm2,               xmm3            ; p1-p0
        lea         rdx,                srct

        por         xmm2,               xmm7            ; abs(p1-p0)

        movdqa      t0,                 xmm2            ; save abs(p1-p0)

        pmaxub      xmm0,               xmm2

%if %1
        movdqa      xmm5,               [rdx+32]        ; q0
        movdqa      xmm7,               [rdx+48]        ; q1
%else
        movdqa      xmm5,               [rdx+64]        ; q0
        movdqa      xmm7,               [rdx+80]        ; q1
%endif
        mov         rdx,                arg(3)          ; limit

        movdqa      xmm6,               xmm5            ; q0
        movdqa      xmm2,               xmm7            ; q1

        psubusb     xmm5,               xmm7            ; q0-q1
        psubusb     xmm7,               xmm6            ; q1-q0

        por         xmm7,               xmm5            ; abs(q1-q0)

        movdqa      t1,                 xmm7            ; save abs(q1-q0)

        movdqa      xmm4,               XMMWORD PTR [rdx]; limit

        pmaxub      xmm0,               xmm7
        mov         rdx,                arg(2)          ; flimit

        psubusb     xmm0,               xmm4            ; accumulated max - limit: nonzero where limit exceeded
        movdqa      xmm5,               xmm2            ; q1

        psubusb     xmm5,               xmm1            ; q1-=p1
        psubusb     xmm1,               xmm2            ; p1-=q1

        por         xmm5,               xmm1            ; abs(p1-q1)
        movdqa      xmm1,               xmm3            ; p0

        pand        xmm5,               [GLOBAL(tfe)]   ; set lsb of each byte to zero
        psubusb     xmm1,               xmm6            ; p0-q0

        psrlw       xmm5,               1               ; abs(p1-q1)/2
        psubusb     xmm6,               xmm3            ; q0-p0

        movdqa      xmm2,               XMMWORD PTR [rdx]; flimit

        mov         rdx,                arg(4)          ; get thresh

        por         xmm1,               xmm6            ; abs(q0-p0)
        paddb       xmm2,               xmm2            ; flimit*2 (less than 255)

        movdqa      xmm6,               t0              ; get abs(p1 - p0)

        paddusb     xmm1,               xmm1            ; abs(q0-p0)*2

        movdqa      xmm3,               t1              ; get abs(q1 - q0)

        movdqa      xmm7,               XMMWORD PTR [rdx] ; thresh

        paddusb     xmm1,               xmm5            ; abs (p0 - q0) *2 + abs(p1-q1)/2
        psubusb     xmm6,               xmm7            ; abs(p1 - p0) > thresh

        paddb       xmm4,               xmm2            ; flimit * 2 + limit (less than 255)
        psubusb     xmm3,               xmm7            ; abs(q1 - q0) > thresh

        psubusb     xmm1,               xmm4            ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
        por         xmm6,               xmm3            ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        por         xmm1,               xmm0            ; mask
        pcmpeqb     xmm6,               xmm0            ; hev (xmm0 nonzero means the mask fails anyway, so those bytes are don't-care)

        pxor        xmm0,               xmm0
        pcmpeqb     xmm4,               xmm4            ; all ones

        pcmpeqb     xmm1,               xmm0            ; mask
        pxor        xmm4,               xmm6            ; hev
%endmacro

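;
; BV_TRANSPOSE re-interleaves the four filtered columns p1/p0/q0/q1
; (xmm1/xmm6/xmm3/xmm7, i.e. transposed rows 2..5) into 4-byte-per-row
; fragments: xmm2/xmm6/xmm1/xmm5 end up holding rows 0-3, 4-7, 8-11 and
; 12-15 respectively, ready for BV_WRITEBACK.
;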
%macro BV_TRANSPOSE 0
        ; xmm1 =    f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; xmm6 =    f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        ; xmm3 =    f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        ; xmm7 =    f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
        movdqa      xmm2,               xmm1            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpcklbw   xmm2,               xmm6            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02

        movdqa      xmm4,               xmm3            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm1,               xmm6            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklbw   xmm4,               xmm7            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        punpckhbw   xmm3,               xmm7            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        movdqa      xmm6,               xmm2            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpcklwd   xmm2,               xmm4            ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02

        punpckhwd   xmm6,               xmm4            ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        movdqa      xmm5,               xmm1            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklwd   xmm1,               xmm3            ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82

        punpckhwd   xmm5,               xmm3            ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
%endmacro

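;
; BV_WRITEBACK stores one register per group of four rows: each movd
; writes the four filtered pixels (p1 p0 q0 q1) of one row at column
; offset +2 within the 8-pixel-wide strip, and psrldq shifts the next
; row's four bytes into place.  %1 covers rows 0..3, %2 rows 4..7
; (rcx = 3 * rax).
;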
%macro BV_WRITEBACK 2
        movd        [rsi+2],            %1
        psrldq      %1,                 4

        movd        [rdi+2],            %1
        psrldq      %1,                 4

        movd        [rsi+2*rax+2],      %1
        psrldq      %1,                 4

        movd        [rdi+2*rax+2],      %1

        movd        [rsi+4*rax+2],      %2
        psrldq      %2,                 4

        movd        [rdi+4*rax+2],      %2
        psrldq      %2,                 4

        movd        [rsi+2*rcx+2],      %2
        psrldq      %2,                 4

        movd        [rdi+2*rcx+2],      %2
%endmacro


;void vp8_loop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_vertical_edge_sse2)
sym(vp8_loop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub             rsp, 96      ; reserve 96 bytes
    %define t0      [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1      [rsp + 16]   ;__declspec(align(16)) char t1[16];
    %define srct    [rsp + 32]   ;__declspec(align(16)) char srct[64];
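    ; srct holds the transposed rows p1, p0, q0 and q1 at offsets 0, 16,
    ; 32 and 48, as stored by TRANSPOSE_16X8 with %2 == 1 and consumed
    ; by LFV_FILTER_MASK_HEV_MASK 1 and B_FILTER 2.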

        mov         rsi,        arg(0)                  ; src_ptr
        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step

        lea         rsi,        [rsi - 4]
        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
        lea         rcx,        [rax*2+rax]

        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
        TRANSPOSE_16X8 1, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK 1

        ; start work on filters
        B_FILTER 2

        ; transpose and write back - only work on q1, q0, p0, p1
        BV_TRANSPOSE
        ; store 16-line result

        lea         rdx,        [rax]
        neg         rdx

        BV_WRITEBACK xmm1, xmm5

        lea         rsi,        [rsi+rdx*8]
        lea         rdi,        [rdi+rdx*8]
        BV_WRITEBACK xmm2, xmm6

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_loop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_vertical_edge_uv_sse2)
sym(vp8_loop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub             rsp, 96      ; reserve 96 bytes
    %define t0      [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1      [rsp + 16]   ;__declspec(align(16)) char t1[16];
    %define srct    [rsp + 32]   ;__declspec(align(16)) char srct[64];

        mov         rsi,        arg(0)                  ; u_ptr
        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step

        lea         rsi,        [rsi - 4]
        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
        lea         rcx,        [rax+2*rax]

        lea         rdx,        srct

        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
        TRANSPOSE_16X8 0, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK 1

        ; start work on filters
        B_FILTER 2

        ; transpose and write back - only work on q1, q0, p0, p1
   1125         BV_TRANSPOSE
   1126 
   1127         lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
   1128 
   1129         ; store 16-line result
   1130         BV_WRITEBACK xmm1, xmm5
   1131 
    1132         mov         rsi,        arg(5)                  ; v_ptr
   1133         lea         rsi,        [rsi - 4]
   1134         lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
   1135         BV_WRITEBACK xmm2, xmm6
   1136 
   1137     add rsp, 96
   1138     pop rsp
   1139     ; begin epilog
   1140     pop rdi
   1141     pop rsi
   1142     RESTORE_GOT
   1143     RESTORE_XMM
   1144     UNSHADOW_ARGS
   1145     pop         rbp
   1146     ret
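         ; Note: in this UV variant the low eight lanes of each register carry
         ; u rows and the high eight carry v rows, which is why the first
         ; BV_WRITEBACK above lands at u_ptr and the second at v_ptr.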
   1147 
   1148 %macro MBV_TRANSPOSE 0
   1149         movdqa      xmm0,               [rdx]               ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
   1150         movdqa      xmm1,               xmm0                ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
   1151 
   1152         punpcklbw   xmm0,               xmm7                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
   1153         punpckhbw   xmm1,               xmm7                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
   1154 
   1155         movdqa      xmm2,               [rdx+32]            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
   1156         movdqa      xmm6,               xmm2                ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
   1157 
   1158         punpcklbw   xmm2,               [rdx+48]            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
   1159         punpckhbw   xmm6,               [rdx+48]            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
   1160 
   1161         movdqa      xmm3,               xmm0                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
   1162         punpcklwd   xmm0,               xmm2                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
   1163 
   1164         punpckhwd   xmm3,               xmm2                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
   1165         movdqa      xmm4,               xmm1                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
   1166 
   1167         punpcklwd   xmm1,               xmm6                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
   1168         punpckhwd   xmm4,               xmm6                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
   1169 
   1170         movdqa      xmm2,               [rdx+64]            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
   1171         punpcklbw   xmm2,               [rdx+80]            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
   1172 
   1173         movdqa      xmm6,               xmm5                ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
   1174         punpcklbw   xmm6,               [rdx+112]           ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
   1175 
   1176         movdqa      xmm7,               xmm2                ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
   1177         punpcklwd   xmm2,               xmm6                ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04
   1178 
   1179         punpckhwd   xmm7,               xmm6                ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
   1180         movdqa      xmm6,               xmm0                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
   1181 
   1182         punpckldq   xmm0,               xmm2                ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
   1183         punpckhdq   xmm6,               xmm2                ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
   1184 %endmacro
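         ; Each punpck pass above interleaves two rows while doubling the
         ; element width; three rounds (bytes, then words, then dwords)
         ; complete the rotation back to raster order.  An illustrative C
         ; model of the byte step (our sketch, not project code):
         ;
         ;   /* punpcklbw: interleave the low 8 bytes of a and b */
         ;   void interleave_lo8(unsigned char dst[16],
         ;                       const unsigned char a[16],
         ;                       const unsigned char b[16])
         ;   {
         ;       for (int i = 0; i < 8; i++) {
         ;           dst[2 * i]     = a[i];   /* even bytes from a */
         ;           dst[2 * i + 1] = b[i];   /* odd bytes from b  */
         ;       }
         ;   }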
   1185 
   1186 %macro MBV_WRITEBACK_1 0
   1187         movq        QWORD  PTR [rsi],   xmm0
   1188         movhps      MMWORD PTR [rdi],   xmm0
   1189 
   1190         movq        QWORD  PTR [rsi+2*rax], xmm6
   1191         movhps      MMWORD PTR [rdi+2*rax], xmm6
   1192 
   1193         movdqa      xmm0,               xmm3                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
   1194         punpckldq   xmm0,               xmm7                ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
   1195 
   1196         punpckhdq   xmm3,               xmm7                ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
   1197 
   1198         movq        QWORD  PTR [rsi+4*rax], xmm0
   1199         movhps      MMWORD PTR [rdi+4*rax], xmm0
   1200 
   1201         movq        QWORD  PTR [rsi+2*rcx], xmm3
   1202         movhps      MMWORD PTR [rdi+2*rcx], xmm3
   1203 
   1204         movdqa      xmm2,               [rdx+64]            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
   1205         punpckhbw   xmm2,               [rdx+80]            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
   1206 
   1207         punpckhbw   xmm5,               [rdx+112]           ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
   1208         movdqa      xmm0,               xmm2
   1209 
    1210         punpcklwd   xmm0,               xmm5                ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
   1211         punpckhwd   xmm2,               xmm5                ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
   1212 
   1213         movdqa      xmm5,               xmm1                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
    1214         punpckldq   xmm1,               xmm0                ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80
   1215 
   1216         punpckhdq   xmm5,               xmm0                ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
   1217 %endmacro
   1218 
   1219 %macro MBV_WRITEBACK_2 0
   1220         movq        QWORD  PTR [rsi],   xmm1
   1221         movhps      MMWORD PTR [rdi],   xmm1
   1222 
   1223         movq        QWORD  PTR [rsi+2*rax], xmm5
   1224         movhps      MMWORD PTR [rdi+2*rax], xmm5
   1225 
   1226         movdqa      xmm1,               xmm4                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
   1227         punpckldq   xmm1,               xmm2                ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
    1228         punpckhdq   xmm4,               xmm2                ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
   1229 
   1230         movq        QWORD  PTR [rsi+4*rax], xmm1
   1231         movhps      MMWORD PTR [rdi+4*rax], xmm1
   1232 
   1233         movq        QWORD  PTR [rsi+2*rcx], xmm4
   1234         movhps      MMWORD PTR [rdi+2*rcx], xmm4
   1235 %endmacro
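         ; Each register written back above carries two finished rows: movq
         ; stores its low eight bytes at an rsi-based (even) row and movhps
         ; its high eight at the matching rdi-based (odd) row, with
         ; rcx = 3*pitch covering rows 6 and 7.  A C model of one half-block
         ; (our sketch; row_pair[] holds four 16-byte register images, dst is
         ; the first row):
         ;
         ;   for (int r = 0; r < 8; r += 2) {
         ;       memcpy(dst + (r + 0) * pitch, &row_pair[r / 2][0], 8); /* movq   */
         ;       memcpy(dst + (r + 1) * pitch, &row_pair[r / 2][8], 8); /* movhps */
         ;   }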
   1236 
   1237 
   1238 ;void vp8_mbloop_filter_vertical_edge_sse2
   1239 ;(
   1240 ;    unsigned char *src_ptr,
   1241 ;    int            src_pixel_step,
   1242 ;    const char    *flimit,
   1243 ;    const char    *limit,
   1244 ;    const char    *thresh,
   1245 ;    int            count
   1246 ;)
   1247 global sym(vp8_mbloop_filter_vertical_edge_sse2)
   1248 sym(vp8_mbloop_filter_vertical_edge_sse2):
   1249     push        rbp
   1250     mov         rbp, rsp
   1251     SHADOW_ARGS_TO_STACK 6
   1252     SAVE_XMM
   1253     GET_GOT     rbx
   1254     push        rsi
   1255     push        rdi
   1256     ; end prolog
   1257 
   1258     ALIGN_STACK 16, rax
   1259     sub          rsp, 160     ; reserve 160 bytes
   1260     %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[16];
   1261     %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[16];
   1262     %define srct [rsp + 32]   ;__declspec(align(16)) char srct[128];
   1263 
   1264         mov         rsi,                arg(0)              ; src_ptr
   1265         movsxd      rax,                dword ptr arg(1)    ; src_pixel_step
   1266 
   1267         lea         rsi,                [rsi - 4]
   1268         lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
   1269         lea         rcx,                [rax*2+rax]
   1270 
   1271         ; Transpose
   1272         TRANSPOSE_16X8 1, 0
   1273 
   1274         ; calculate filter mask and high edge variance
   1275         LFV_FILTER_MASK_HEV_MASK 0
   1276 
   1277         neg         rax
   1278         ; start work on filters
   1279         MB_FILTER_AND_WRITEBACK 2
   1280 
   1281         lea         rsi,                [rsi+rax*8]
   1282         lea         rdi,                [rdi+rax*8]
   1283 
   1284         ; transpose and write back
   1285         MBV_TRANSPOSE
   1286 
   1287         neg         rax
   1288 
   1289         MBV_WRITEBACK_1
   1290 
   1291         lea         rsi,                [rsi+rax*8]
   1292         lea         rdi,                [rdi+rax*8]
   1293         MBV_WRITEBACK_2
   1294 
   1295     add rsp, 160
   1296     pop rsp
   1297     ; begin epilog
   1298     pop rdi
   1299     pop rsi
   1300     RESTORE_GOT
   1301     RESTORE_XMM
   1302     UNSHADOW_ARGS
   1303     pop         rbp
   1304     ret
   1305 
   1306 
   1307 ;void vp8_mbloop_filter_vertical_edge_uv_sse2
   1308 ;(
   1309 ;    unsigned char *u,
   1310 ;    int            src_pixel_step,
   1311 ;    const char    *flimit,
   1312 ;    const char    *limit,
   1313 ;    const char    *thresh,
   1314 ;    unsigned char *v
   1315 ;)
   1316 global sym(vp8_mbloop_filter_vertical_edge_uv_sse2)
   1317 sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
   1318     push        rbp
   1319     mov         rbp, rsp
   1320     SHADOW_ARGS_TO_STACK 6
   1321     SAVE_XMM
   1322     GET_GOT     rbx
   1323     push        rsi
   1324     push        rdi
   1325     ; end prolog
   1326 
   1327     ALIGN_STACK 16, rax
   1328     sub          rsp, 160     ; reserve 160 bytes
   1329     %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[16];
   1330     %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[16];
   1331     %define srct [rsp + 32]   ;__declspec(align(16)) char srct[128];
   1332 
   1333         mov         rsi,                arg(0)              ; u_ptr
   1334         movsxd      rax,                dword ptr arg(1)    ; src_pixel_step
   1335 
   1336         lea         rsi,                [rsi - 4]
   1337         lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
   1338         lea         rcx,                [rax+2*rax]
   1339 
   1340         lea         rdx,                srct
   1341 
   1342         ; Transpose
   1343         TRANSPOSE_16X8 0, 0
   1344 
   1345         ; calculate filter mask and high edge variance
   1346         LFV_FILTER_MASK_HEV_MASK 0
   1347 
   1348         ; start work on filters
   1349         MB_FILTER_AND_WRITEBACK 2
   1350 
   1351         ; transpose and write back
   1352         MBV_TRANSPOSE
   1353 
   1354         mov         rsi,                arg(0)             ;u_ptr
   1355         lea         rsi,                [rsi - 4]
   1356         lea         rdi,                [rsi + rax]
   1357         MBV_WRITEBACK_1
   1358         mov         rsi,                arg(5)             ;v_ptr
   1359         lea         rsi,                [rsi - 4]
   1360         lea         rdi,                [rsi + rax]
   1361         MBV_WRITEBACK_2
   1362 
   1363     add rsp, 160
   1364     pop rsp
   1365     ; begin epilog
   1366     pop rdi
   1367     pop rsi
   1368     RESTORE_GOT
   1369     RESTORE_XMM
   1370     UNSHADOW_ARGS
   1371     pop         rbp
   1372     ret
   1373 
   1374 
   1375 ;void vp8_loop_filter_simple_horizontal_edge_sse2
   1376 ;(
   1377 ;    unsigned char *src_ptr,
   1378 ;    int  src_pixel_step,
   1379 ;    const char *flimit,
   1380 ;    const char *limit,
   1381 ;    const char *thresh,
   1382 ;    int count
   1383 ;)
   1384 global sym(vp8_loop_filter_simple_horizontal_edge_sse2)
   1385 sym(vp8_loop_filter_simple_horizontal_edge_sse2):
   1386     push        rbp
   1387     mov         rbp, rsp
   1388     SHADOW_ARGS_TO_STACK 6
   1389     SAVE_XMM
   1390     GET_GOT     rbx
   1391     push        rsi
   1392     push        rdi
   1393     ; end prolog
   1394 
   1395         mov         rsi, arg(0)             ;src_ptr
    1396         movsxd      rax, dword ptr arg(1)   ; src_pixel_step (stride)
    1397         mov         rdx, arg(2)             ; flimit
   1398         movdqa      xmm3, XMMWORD PTR [rdx]
   1399         mov         rdx, arg(3) ;limit
   1400         movdqa      xmm7, XMMWORD PTR [rdx]
   1401 
   1402         paddb       xmm3, xmm3              ; flimit*2 (less than 255)
   1403         paddb       xmm3, xmm7              ; flimit * 2 + limit (less than 255)
   1404 
   1405         mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
   1406         add         rdi, rax
   1407         neg         rax
   1408 
   1409         ; calculate mask
   1410         movdqu      xmm1, [rsi+2*rax]       ; p1
   1411         movdqu      xmm0, [rdi]             ; q1
   1412         movdqa      xmm2, xmm1
   1413         movdqa      xmm7, xmm0
   1414         movdqa      xmm4, xmm0
   1415         psubusb     xmm0, xmm1              ; q1-=p1
   1416         psubusb     xmm1, xmm4              ; p1-=q1
   1417         por         xmm1, xmm0              ; abs(p1-q1)
   1418         pand        xmm1, [GLOBAL(tfe)]     ; set lsb of each byte to zero
   1419         psrlw       xmm1, 1                 ; abs(p1-q1)/2
   1420 
   1421         movdqu      xmm5, [rsi+rax]         ; p0
   1422         movdqu      xmm4, [rsi]             ; q0
   1423         movdqa      xmm0, xmm4              ; q0
   1424         movdqa      xmm6, xmm5              ; p0
   1425         psubusb     xmm5, xmm4              ; p0-=q0
   1426         psubusb     xmm4, xmm6              ; q0-=p0
   1427         por         xmm5, xmm4              ; abs(p0 - q0)
   1428         paddusb     xmm5, xmm5              ; abs(p0-q0)*2
   1429         paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
   1430 
   1431         psubusb     xmm5, xmm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
   1432         pxor        xmm3, xmm3
   1433         pcmpeqb     xmm5, xmm3
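         ; The sequence above is the simple filter's only mask: an edge lane
         ; survives when 2*|p0-q0| + |p1-q1|/2 does not exceed
         ; 2*flimit + limit.  An illustrative per-lane C model (our sketch,
         ; not libvpx's reference code; note every SIMD add/subtract above
         ; saturates to [0, 255]):
         ;
         ;   unsigned char simple_filter_mask(unsigned char p1, unsigned char p0,
         ;                                    unsigned char q0, unsigned char q1,
         ;                                    unsigned char flimit, unsigned char limit)
         ;   {
         ;       int measure = 2 * abs(p0 - q0) + abs(p1 - q1) / 2;
         ;       return (measure <= 2 * flimit + limit) ? 0xff : 0x00;
         ;   }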
   1434 
   1435         ; start work on filters
   1436         pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
   1437         pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
   1438         psubsb      xmm2, xmm7              ; p1 - q1
   1439 
   1440         pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values
   1441         pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values
   1442         movdqa      xmm3, xmm0              ; q0
   1443         psubsb      xmm0, xmm6              ; q0 - p0
   1444         paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
   1445         paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
   1446         paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
   1447         pand        xmm5, xmm2              ; mask filter values we don't care about
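         ; In scalar terms the masked value built above is the common VP8
         ; filter term.  A rough C model (ours; clamp_s8() is a hypothetical
         ; helper saturating to [-128, 127] the way psubsb/paddsb do, and
         ; ps*/qs* are the pixels after the ^0x80 signed offset):
         ;
         ;   int d = clamp_s8(qs0 - ps0);
         ;   int f = clamp_s8(ps1 - qs1);
         ;   f = clamp_s8(f + d);            /* p1 - q1 + 1 * (q0 - p0) */
         ;   f = clamp_s8(f + d);            /* p1 - q1 + 2 * (q0 - p0) */
         ;   f = clamp_s8(f + d);            /* p1 - q1 + 3 * (q0 - p0) */
         ;   f = mask ? f : 0;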
   1448 
   1449         ; do + 4 side
   1450         paddsb      xmm5, [GLOBAL(t4)]      ; 3* (q0 - p0) + (p1 - q1) + 4
   1451 
   1452         movdqa      xmm0, xmm5              ; get a copy of filters
   1453         psllw       xmm0, 8                 ; shift left 8
    1454         psraw       xmm0, 3                 ; arithmetic shift right 3
    1455         psrlw       xmm0, 8                 ; move result back to the low byte
   1456         movdqa      xmm1, xmm5              ; get a copy of filters
   1457         psraw       xmm1, 11                ; arithmetic shift right 11
   1458         psllw       xmm1, 8                 ; shift left 8 to put it back
   1459 
   1460         por         xmm0, xmm1              ; put the two together to get result
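         ; SSE2 has no per-byte arithmetic shift, so the six instructions
         ; above synthesize one: the low byte of each word goes through
         ; psllw 8 / psraw 3 / psrlw 8, the high byte through psraw 11 /
         ; psllw 8, and por merges the halves.  A C model of one 16-bit lane
         ; (our sketch; assumes two's complement and arithmetic >> on signed
         ; values, as on x86):
         ;
         ;   unsigned short sra3_per_byte(unsigned short w)
         ;   {
         ;       unsigned lo = (unsigned short)((short)(w << 8) >> 3) >> 8;
         ;       unsigned hi = (unsigned short)(((short)w >> 11) << 8);
         ;       return (unsigned short)(hi | lo);  /* each byte >>= 3, signed */
         ;   }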
    1461         psubsb      xmm3, xmm0              ; subtract filter from q0
   1462         psubsb      xmm3, xmm0              ; q0-= q0 add
   1463         pxor        xmm3, [GLOBAL(t80)]     ; unoffset
   1464         movdqu      [rsi], xmm3             ; write back
   1465 
   1466         ; now do +3 side
   1467         psubsb      xmm5, [GLOBAL(t1s)]     ; +3 instead of +4
   1468 
   1469         movdqa      xmm0, xmm5              ; get a copy of filters
   1470         psllw       xmm0, 8                 ; shift left 8
    1471         psraw       xmm0, 3                 ; arithmetic shift right 3
    1472         psrlw       xmm0, 8                 ; move result back to the low byte
   1473         psraw       xmm5, 11                ; arithmetic shift right 11
   1474         psllw       xmm5, 8                 ; shift left 8 to put it back
   1475         por         xmm0, xmm5              ; put the two together to get result
   1476 
   1477 
    1478         paddsb      xmm6, xmm0              ; add filter to p0
   1479         pxor        xmm6, [GLOBAL(t80)]     ; unoffset
   1480         movdqu      [rsi+rax], xmm6         ; write back
   1481 
   1482     ; begin epilog
   1483     pop rdi
   1484     pop rsi
   1485     RESTORE_GOT
   1486     RESTORE_XMM
   1487     UNSHADOW_ARGS
   1488     pop         rbp
   1489     ret
   1490 
   1491 
   1492 ;void vp8_loop_filter_simple_vertical_edge_sse2
   1493 ;(
   1494 ;    unsigned char *src_ptr,
   1495 ;    int  src_pixel_step,
   1496 ;    const char *flimit,
   1497 ;    const char *limit,
   1498 ;    const char *thresh,
   1499 ;    int count
   1500 ;)
   1501 global sym(vp8_loop_filter_simple_vertical_edge_sse2)
   1502 sym(vp8_loop_filter_simple_vertical_edge_sse2):
   1503     push        rbp         ; save old base pointer value.
   1504     mov         rbp, rsp    ; set new base pointer value.
   1505     SHADOW_ARGS_TO_STACK 6
   1506     SAVE_XMM
   1507     GET_GOT     rbx         ; save callee-saved reg
   1508     push        rsi
   1509     push        rdi
   1510     ; end prolog
   1511 
   1512     ALIGN_STACK 16, rax
   1513     sub         rsp, 32                         ; reserve 32 bytes
   1514     %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[16];
   1515     %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[16];
   1516 
   1517         mov         rsi, arg(0) ;src_ptr
    1518         movsxd      rax, dword ptr arg(1)   ; src_pixel_step (stride)
   1519 
    1520         lea         rsi,        [rsi - 2]
   1521         lea         rdi,        [rsi + rax]
   1522         lea         rdx,        [rsi + rax*4]
   1523         lea         rcx,        [rdx + rax]
   1524 
   1525         movdqu      xmm0,       [rsi]                   ; (high 96 bits unused) 03 02 01 00
   1526         movdqu      xmm1,       [rdx]                   ; (high 96 bits unused) 43 42 41 40
   1527         movdqu      xmm2,       [rdi]                   ; 13 12 11 10
   1528         movdqu      xmm3,       [rcx]                   ; 53 52 51 50
   1529         punpckldq   xmm0,       xmm1                    ; (high 64 bits unused) 43 42 41 40 03 02 01 00
   1530         punpckldq   xmm2,       xmm3                    ; 53 52 51 50 13 12 11 10
   1531 
   1532         movdqu      xmm4,       [rsi + rax*2]           ; 23 22 21 20
   1533         movdqu      xmm5,       [rdx + rax*2]           ; 63 62 61 60
   1534         movdqu      xmm6,       [rdi + rax*2]           ; 33 32 31 30
   1535         movdqu      xmm7,       [rcx + rax*2]           ; 73 72 71 70
   1536         punpckldq   xmm4,       xmm5                    ; 63 62 61 60 23 22 21 20
   1537         punpckldq   xmm6,       xmm7                    ; 73 72 71 70 33 32 31 30
   1538 
   1539         punpcklbw   xmm0,       xmm2                    ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
   1540         punpcklbw   xmm4,       xmm6                    ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
   1541 
   1542         movdqa      xmm1,       xmm0
   1543         punpcklwd   xmm0,       xmm4                    ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
   1544         punpckhwd   xmm1,       xmm4                    ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
   1545 
   1546         movdqa      xmm2,       xmm0
   1547         punpckldq   xmm0,       xmm1                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
   1548         punpckhdq   xmm2,       xmm1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
   1549 
   1550         movdqa      t0,         xmm0                    ; save to t0
   1551         movdqa      t1,         xmm2                    ; save to t1
   1552 
   1553         lea         rsi,        [rsi + rax*8]
   1554         lea         rdi,        [rsi + rax]
   1555         lea         rdx,        [rsi + rax*4]
   1556         lea         rcx,        [rdx + rax]
   1557 
   1558         movdqu      xmm4,       [rsi]                   ; 83 82 81 80
   1559         movdqu      xmm1,       [rdx]                   ; c3 c2 c1 c0
   1560         movdqu      xmm6,       [rdi]                   ; 93 92 91 90
   1561         movdqu      xmm3,       [rcx]                   ; d3 d2 d1 d0
   1562         punpckldq   xmm4,       xmm1                    ; c3 c2 c1 c0 83 82 81 80
   1563         punpckldq   xmm6,       xmm3                    ; d3 d2 d1 d0 93 92 91 90
   1564 
   1565         movdqu      xmm0,       [rsi + rax*2]           ; a3 a2 a1 a0
   1566         movdqu      xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
   1567         movdqu      xmm2,       [rdi + rax*2]           ; b3 b2 b1 b0
   1568         movdqu      xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
   1569         punpckldq   xmm0,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
   1570         punpckldq   xmm2,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0
   1571 
   1572         punpcklbw   xmm4,       xmm6                    ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
   1573         punpcklbw   xmm0,       xmm2                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
   1574 
   1575         movdqa      xmm1,       xmm4
   1576         punpcklwd   xmm4,       xmm0                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
   1577         punpckhwd   xmm1,       xmm0                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
   1578 
   1579         movdqa      xmm6,       xmm4
   1580         punpckldq   xmm4,       xmm1                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
   1581         punpckhdq   xmm6,       xmm1                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
   1582 
   1583         movdqa      xmm0,       t0                      ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
   1584         movdqa      xmm2,       t1                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
   1585         movdqa      xmm1,       xmm0
   1586         movdqa      xmm3,       xmm2
   1587 
   1588         punpcklqdq  xmm0,       xmm4                    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
   1589         punpckhqdq  xmm1,       xmm4                    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
   1590         punpcklqdq  xmm2,       xmm6                    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
   1591         punpckhqdq  xmm3,       xmm6                    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
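         ; At this point each register holds one pixel column across all 16
         ; rows: xmm0 = p1, xmm1 = p0, xmm2 = q0, xmm3 = q1.  The equivalent
         ; gather in C (our sketch; src already points two bytes left of the
         ; edge):
         ;
         ;   for (int r = 0; r < 16; r++) {
         ;       p1[r] = src[r * pitch + 0];
         ;       p0[r] = src[r * pitch + 1];
         ;       q0[r] = src[r * pitch + 2];
         ;       q1[r] = src[r * pitch + 3];
         ;   }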
   1592 
   1593         ; calculate mask
   1594         movdqa      xmm6,       xmm0                            ; p1
   1595         movdqa      xmm7,       xmm3                            ; q1
   1596         psubusb     xmm7,       xmm0                            ; q1-=p1
   1597         psubusb     xmm6,       xmm3                            ; p1-=q1
   1598         por         xmm6,       xmm7                            ; abs(p1-q1)
   1599         pand        xmm6,       [GLOBAL(tfe)]                   ; set lsb of each byte to zero
   1600         psrlw       xmm6,       1                               ; abs(p1-q1)/2
   1601 
   1602         movdqa      xmm5,       xmm1                            ; p0
   1603         movdqa      xmm4,       xmm2                            ; q0
   1604         psubusb     xmm5,       xmm2                            ; p0-=q0
   1605         psubusb     xmm4,       xmm1                            ; q0-=p0
   1606         por         xmm5,       xmm4                            ; abs(p0 - q0)
   1607         paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
   1608         paddusb     xmm5,       xmm6                            ; abs (p0 - q0) *2 + abs(p1-q1)/2
   1609 
   1610         mov         rdx,        arg(2)                          ;flimit
   1611         movdqa      xmm7, XMMWORD PTR [rdx]
   1612         mov         rdx,        arg(3)                          ; get limit
   1613         movdqa      xmm6, XMMWORD PTR [rdx]
   1614         paddb       xmm7,        xmm7                           ; flimit*2 (less than 255)
   1615         paddb       xmm7,        xmm6                           ; flimit * 2 + limit (less than 255)
   1616 
   1617         psubusb     xmm5,        xmm7                           ; abs(p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
   1618         pxor        xmm7,        xmm7
    1619         pcmpeqb     xmm5,        xmm7                           ; xmm5 = mask
   1620 
   1621         ; start work on filters
   1622         movdqa        t0,        xmm0
   1623         movdqa        t1,        xmm3
   1624 
   1625         pxor        xmm0,        [GLOBAL(t80)]                  ; p1 offset to convert to signed values
   1626         pxor        xmm3,        [GLOBAL(t80)]                  ; q1 offset to convert to signed values
   1627 
   1628         psubsb      xmm0,        xmm3                           ; p1 - q1
   1629         movdqa      xmm6,        xmm1                           ; p0
   1630 
   1631         movdqa      xmm7,        xmm2                           ; q0
   1632         pxor        xmm6,        [GLOBAL(t80)]                  ; offset to convert to signed values
   1633 
   1634         pxor        xmm7,        [GLOBAL(t80)]                  ; offset to convert to signed values
    1635         movdqa      xmm3,        xmm7                           ; keep a signed copy of q0
   1636 
   1637         psubsb      xmm7,        xmm6                           ; q0 - p0
   1638         paddsb      xmm0,        xmm7                           ; p1 - q1 + 1 * (q0 - p0)
   1639 
   1640         paddsb      xmm0,        xmm7                           ; p1 - q1 + 2 * (q0 - p0)
   1641         paddsb      xmm0,        xmm7                           ; p1 - q1 + 3 * (q0 - p0)
   1642 
   1643         pand        xmm5,        xmm0                           ; mask filter values we don't care about
   1644 
   1645 
   1646         paddsb      xmm5,        [GLOBAL(t4)]                   ;  3* (q0 - p0) + (p1 - q1) + 4
   1647 
   1648         movdqa      xmm0,        xmm5                           ; get a copy of filters
   1649         psllw       xmm0,        8                              ; shift left 8
   1650 
    1651         psraw       xmm0,        3                              ; arithmetic shift right 3
    1652         psrlw       xmm0,        8                              ; move result back to the low byte
   1653 
   1654         movdqa      xmm7,        xmm5                           ; get a copy of filters
   1655         psraw       xmm7,        11                             ; arithmetic shift right 11
   1656 
   1657         psllw       xmm7,        8                              ; shift left 8 to put it back
   1658         por         xmm0,        xmm7                           ; put the two together to get result
   1659 
    1660         psubsb      xmm3,        xmm0                           ; subtract filter from q0
   1661         pxor        xmm3,        [GLOBAL(t80)]                  ; unoffset   q0
   1662 
   1663         ; now do +3 side
   1664         psubsb      xmm5,        [GLOBAL(t1s)]                  ; +3 instead of +4
   1665         movdqa      xmm0,        xmm5                           ; get a copy of filters
   1666 
   1667         psllw       xmm0,        8                              ; shift left 8
    1668         psraw       xmm0,        3                              ; arithmetic shift right 3
    1669 
    1670         psrlw       xmm0,        8                              ; move result back to the low byte
   1671         psraw       xmm5,        11                             ; arithmetic shift right 11
   1672 
   1673         psllw       xmm5,        8                              ; shift left 8 to put it back
   1674         por         xmm0,        xmm5                           ; put the two together to get result
   1675 
    1676         paddsb      xmm6,        xmm0                           ; add filter to p0
   1677         pxor        xmm6,        [GLOBAL(t80)]                  ; unoffset   p0
   1678 
   1679         movdqa      xmm0,        t0                             ; p1
   1680         movdqa      xmm4,        t1                             ; q1
   1681 
   1682         ; transpose back to write out
   1683         ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
   1684         ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
   1685         ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
   1686         ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
   1687         movdqa      xmm1,       xmm0
   1688         punpcklbw   xmm0,       xmm6                               ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
   1689         punpckhbw   xmm1,       xmm6                               ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
   1690 
   1691         movdqa      xmm5,       xmm3
   1692         punpcklbw   xmm3,       xmm4                               ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
   1693         punpckhbw   xmm5,       xmm4                               ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
   1694 
   1695         movdqa      xmm2,       xmm0
   1696         punpcklwd   xmm0,       xmm3                               ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
   1697         punpckhwd   xmm2,       xmm3                               ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
   1698 
   1699         movdqa      xmm3,       xmm1
   1700         punpcklwd   xmm1,       xmm5                               ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
   1701         punpckhwd   xmm3,       xmm5                               ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
   1702 
    1703         ; row order of results: xmm0 (rows 0-3), xmm2 (rows 4-7), xmm1 (rows 8-11), xmm3 (rows 12-15)
   1704         lea         rdx,        [rsi + rax*4]
   1705 
   1706         movd        [rsi],      xmm1                               ; write the second 8-line result
   1707         psrldq      xmm1,       4
   1708         movd        [rdi],      xmm1
   1709         psrldq      xmm1,       4
   1710         movd        [rsi + rax*2], xmm1
   1711         psrldq      xmm1,       4
   1712         movd        [rdi + rax*2], xmm1
   1713 
   1714         movd        [rdx],      xmm3
   1715         psrldq      xmm3,       4
   1716         movd        [rcx],      xmm3
   1717         psrldq      xmm3,       4
   1718         movd        [rdx + rax*2], xmm3
   1719         psrldq      xmm3,       4
   1720         movd        [rcx + rax*2], xmm3
   1721 
   1722         neg         rax
   1723         lea         rsi,        [rsi + rax*8]
   1724         neg         rax
   1725         lea         rdi,        [rsi + rax]
   1726         lea         rdx,        [rsi + rax*4]
   1727         lea         rcx,        [rdx + rax]
   1728 
   1729         movd        [rsi],      xmm0                                ; write the first 8-line result
   1730         psrldq      xmm0,       4
   1731         movd        [rdi],      xmm0
   1732         psrldq      xmm0,       4
   1733         movd        [rsi + rax*2], xmm0
   1734         psrldq      xmm0,       4
   1735         movd        [rdi + rax*2], xmm0
   1736 
   1737         movd        [rdx],      xmm2
   1738         psrldq      xmm2,       4
   1739         movd        [rcx],      xmm2
   1740         psrldq      xmm2,       4
   1741         movd        [rdx + rax*2], xmm2
   1742         psrldq      xmm2,       4
   1743         movd        [rcx + rax*2], xmm2
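         ; Each movd/psrldq chain above peels one finished 4-byte row off a
         ; register and stores it back across the edge.  A C model of one
         ; chain (our sketch; col[] is the 16-byte register image and dst the
         ; first of its four rows):
         ;
         ;   for (int r = 0; r < 4; r++)
         ;       memcpy(dst + r * pitch, col + 4 * r, 4);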
   1744 
   1745     add rsp, 32
   1746     pop rsp
   1747     ; begin epilog
   1748     pop rdi
   1749     pop rsi
   1750     RESTORE_GOT
   1751     RESTORE_XMM
   1752     UNSHADOW_ARGS
   1753     pop         rbp
   1754     ret
   1755 
   1756 SECTION_RODATA
   1757 align 16
   1758 tfe:
   1759     times 16 db 0xfe
   1760 align 16
   1761 t80:
   1762     times 16 db 0x80
   1763 align 16
   1764 t1s:
   1765     times 16 db 0x01
   1766 align 16
   1767 t3:
   1768     times 16 db 0x03
   1769 align 16
   1770 t4:
   1771     times 16 db 0x04
   1772 align 16
   1773 ones:
   1774     times 8 dw 0x0001
   1775 align 16
   1776 s9:
   1777     times 8 dw 0x0900
   1778 align 16
   1779 s63:
   1780     times 8 dw 0x003f
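         ; Usage notes (our reading): tfe clears each byte's lsb so psrlw can
         ; halve bytes without cross-byte bleed; t80 toggles pixels between
         ; unsigned and signed form; t4, t3 and t1s supply the +4/+3 rounding
         ; of the filter taps.  ones, s9 and s63 appear to belong to the
         ; word-domain rounding inside the B_FILTER and
         ; MB_FILTER_AND_WRITEBACK macros (VP8-style (9*f + 63) >> 7 taps).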
   1781