;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

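; LFH_FILTER_MASK computes the filter "breakout" mask for a horizontal edge.
; Roughly, in C (a sketch of the test this macro vectorizes with per-byte
; psubusb/por absolute differences; names are illustrative, not the exact
; reference source):
;
;   mask = (abs(q3 - q2) > limit) | (abs(q2 - q1) > limit)
;        | (abs(q1 - q0) > limit) | (abs(p3 - p2) > limit)
;        | (abs(p2 - p1) > limit) | (abs(p1 - p0) > limit)
;        | (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit);
;   mask = ~mask;   /* filter only where the neighborhood is this flat */
;
; %1 == 1: Y plane; whole 16-byte rows are loaded with movdqa.
; %1 == 0: U and V planes; each row is two 8-byte movq loads (u in the low
;          half, v in the high half) merged with pslldq/por, and q2/q1/p2/p1
;          are spilled to the stack for the later macros.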
%macro LFH_FILTER_MASK 1
%if %1
        movdqa      xmm2,                   [rdi+2*rax]       ; q3
        movdqa      xmm1,                   [rsi+2*rax]       ; q2
%else
        movq        xmm0,                   [rsi + rcx*2]     ; q3
        movq        xmm2,                   [rdi + rcx*2]
        pslldq      xmm2,                   8
        por         xmm2,                   xmm0
        movq        xmm1,                   [rsi + rcx]       ; q2
        movq        xmm3,                   [rdi + rcx]
        pslldq      xmm3,                   8
        por         xmm1,                   xmm3
        movdqa      XMMWORD PTR [rsp],      xmm1              ; store q2
%endif

        movdqa      xmm6,                   xmm1              ; q2
        psubusb     xmm1,                   xmm2              ; q2-=q3
        psubusb     xmm2,                   xmm6              ; q3-=q2
        por         xmm1,                   xmm2              ; abs(q3-q2)

        psubusb     xmm1,                   xmm7

%if %1
        movdqa      xmm4,                   [rsi+rax]         ; q1
%else
        movq        xmm0,                   [rsi]             ; q1
        movq        xmm4,                   [rdi]
        pslldq      xmm4,                   8
        por         xmm4,                   xmm0
        movdqa      XMMWORD PTR [rsp + 16], xmm4              ; store q1
%endif

        movdqa      xmm3,                   xmm4              ; q1
        psubusb     xmm4,                   xmm6              ; q1-=q2
        psubusb     xmm6,                   xmm3              ; q2-=q1
        por         xmm4,                   xmm6              ; abs(q2-q1)
        psubusb     xmm4,                   xmm7

        por         xmm1,                   xmm4

%if %1
        movdqa      xmm4,                   [rsi]             ; q0
%else
        movq        xmm4,                   [rsi + rax]       ; q0
        movq        xmm0,                   [rdi + rax]
        pslldq      xmm0,                   8
        por         xmm4,                   xmm0
%endif

        movdqa      xmm0,                   xmm4              ; q0
        psubusb     xmm4,                   xmm3              ; q0-=q1
        psubusb     xmm3,                   xmm0              ; q1-=q0
        por         xmm4,                   xmm3              ; abs(q0-q1)
        movdqa      t0,                     xmm4              ; save to t0

        psubusb     xmm4,                   xmm7
        por         xmm1,                   xmm4

%if %1
        neg         rax                     ; negate pitch to deal with above border

        movdqa      xmm2,                   [rsi+4*rax]       ; p3
        movdqa      xmm4,                   [rdi+4*rax]       ; p2
%else
        lea         rsi,                    [rsi + rax*4]
        lea         rdi,                    [rdi + rax*4]

        movq        xmm2,                   [rsi + rax]       ; p3
        movq        xmm3,                   [rdi + rax]
        pslldq      xmm3,                   8
        por         xmm2,                   xmm3
        movq        xmm4,                   [rsi]             ; p2
        movq        xmm5,                   [rdi]
        pslldq      xmm5,                   8
        por         xmm4,                   xmm5
        movdqa      XMMWORD PTR [rsp + 32], xmm4              ; store p2
%endif

        movdqa      xmm5,                   xmm4              ; p2
        psubusb     xmm4,                   xmm2              ; p2-=p3
        psubusb     xmm2,                   xmm5              ; p3-=p2
        por         xmm4,                   xmm2              ; abs(p3 - p2)

        psubusb     xmm4,                   xmm7
        por         xmm1,                   xmm4

%if %1
        movdqa      xmm4,                   [rsi+2*rax]       ; p1
%else
        movq        xmm4,                   [rsi + rcx]       ; p1
        movq        xmm3,                   [rdi + rcx]
        pslldq      xmm3,                   8
        por         xmm4,                   xmm3
        movdqa      XMMWORD PTR [rsp + 48], xmm4              ; store p1
%endif

        movdqa      xmm3,                   xmm4              ; p1
        psubusb     xmm4,                   xmm5              ; p1-=p2
        psubusb     xmm5,                   xmm3              ; p2-=p1
        por         xmm4,                   xmm5              ; abs(p2 - p1)
        psubusb     xmm4,                   xmm7

        por         xmm1,                   xmm4
        movdqa      xmm2,                   xmm3              ; p1

%if %1
        movdqa      xmm4,                   [rsi+rax]         ; p0
%else
        movq        xmm4,                   [rsi + rcx*2]     ; p0
        movq        xmm5,                   [rdi + rcx*2]
        pslldq      xmm5,                   8
        por         xmm4,                   xmm5
%endif

        movdqa      xmm5,                   xmm4              ; p0
        psubusb     xmm4,                   xmm3              ; p0-=p1
        psubusb     xmm3,                   xmm5              ; p1-=p0
        por         xmm4,                   xmm3              ; abs(p1 - p0)
        movdqa      t1,                     xmm4              ; save to t1

        psubusb     xmm4,                   xmm7
        por         xmm1,                   xmm4

%if %1
        movdqa      xmm3,                   [rdi]             ; q1
%else
        movdqa      xmm3,                   q1                ; q1
%endif

        movdqa      xmm4,                   xmm3              ; q1
        psubusb     xmm3,                   xmm2              ; q1-=p1
        psubusb     xmm2,                   xmm4              ; p1-=q1
        por         xmm2,                   xmm3              ; abs(p1-q1)
        pand        xmm2,                   [tfe GLOBAL]      ; set lsb of each byte to zero
        psrlw       xmm2,                   1                 ; abs(p1-q1)/2

        movdqa      xmm6,                   xmm5              ; p0
        movdqa      xmm3,                   xmm0              ; q0
        psubusb     xmm5,                   xmm3              ; p0-=q0
        psubusb     xmm3,                   xmm6              ; q0-=p0
        por         xmm5,                   xmm3              ; abs(p0 - q0)
        paddusb     xmm5,                   xmm5              ; abs(p0-q0)*2
        paddusb     xmm5,                   xmm2              ; abs(p0 - q0)*2 + abs(p1-q1)/2

        mov         rdx,                    arg(2)            ; get flimit
        movdqa      xmm2,                   XMMWORD PTR [rdx]
        paddb       xmm2,                   xmm2              ; flimit*2 (less than 255)
        paddb       xmm7,                   xmm2              ; flimit*2 + limit (less than 255)

        psubusb     xmm5,                   xmm7              ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit
        por         xmm1,                   xmm5
        pxor        xmm5,                   xmm5
        pcmpeqb     xmm1,                   xmm5              ; mask xmm1
%endmacro

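; LFH_HEV_MASK derives the "high edge variance" mask from the differences
; saved in t0/t1 by LFH_FILTER_MASK.  Sketch of the per-byte test (C-style,
; illustrative):
;
;   hev = (abs(q1 - q0) > thresh) || (abs(p1 - p0) > thresh);
;
; Where hev is set, the (p1 - q1) term participates in the filter below;
; where it is clear, p1/q1 themselves receive the half-strength adjustment
; instead.  Relies on xmm5 still being zero from the end of LFH_FILTER_MASK.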
%macro LFH_HEV_MASK 0
        mov         rdx,                    arg(4)            ; get thresh
        movdqa      xmm7,                   XMMWORD PTR [rdx]

        movdqa      xmm4,                   t0                ; get abs (q1 - q0)
        psubusb     xmm4,                   xmm7
        movdqa      xmm3,                   t1                ; get abs (p1 - p0)
        psubusb     xmm3,                   xmm7
        paddb       xmm4,                   xmm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
        pcmpeqb     xmm4,                   xmm5

        pcmpeqb     xmm5,                   xmm5
        pxor        xmm4,                   xmm5
%endmacro

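; BH_FILTER computes the standard 4-tap loop-filter adjustment in signed
; space (each byte is XORed with 0x80 first).  A C-style sketch of the math
; (illustrative names; clamp() is signed-char saturation):
;
;   a       = clamp(ps1 - qs1) & hev;
;   a       = clamp(a + 3 * (qs0 - ps0)) & mask;
;   Filter1 = clamp(a + 4) >> 3;              /* xmm0: subtracted from q0 */
;   Filter2 = clamp(a + 3) >> 3;              /* xmm2: added to p0        */
;   a       = ((Filter1 + 1) >> 1) & ~hev;    /* xmm4: p1/q1 adjustment   */
;
; The saturating paddsb steps stand in for the repeated clamps.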
%macro BH_FILTER 1
%if %1
        movdqa      xmm2,                   [rsi+2*rax]       ; p1
        movdqa      xmm7,                   [rdi]             ; q1
%else
        movdqa      xmm2,                   p1                ; p1
        movdqa      xmm7,                   q1                ; q1
%endif

        pxor        xmm2,                   [t80 GLOBAL]      ; p1 offset to convert to signed values
        pxor        xmm7,                   [t80 GLOBAL]      ; q1 offset to convert to signed values

        psubsb      xmm2,                   xmm7              ; p1 - q1
        pxor        xmm6,                   [t80 GLOBAL]      ; offset to convert to signed values

        pand        xmm2,                   xmm4              ; high var mask (hvm)(p1 - q1)
        pxor        xmm0,                   [t80 GLOBAL]      ; offset to convert to signed values

        movdqa      xmm3,                   xmm0              ; q0
        psubsb      xmm0,                   xmm6              ; q0 - p0
        paddsb      xmm2,                   xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
        paddsb      xmm2,                   xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
        paddsb      xmm2,                   xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
        pand        xmm1,                   xmm2              ; mask filter values we don't care about
        movdqa      xmm2,                   xmm1
        paddsb      xmm1,                   [t4 GLOBAL]       ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      xmm2,                   [t3 GLOBAL]       ; 3* (q0 - p0) + hvm(p1 - q1) + 3

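        ; Widen to words so psraw can act as a per-byte arithmetic shift:
        ; punpck?bw leaves each payload byte in the high half of a word, so
        ; shifting the word right by 11 computes (byte << 8) >> 11, i.e. a
        ; sign-extended >> 3.  The "x" bytes in the lane notes below are
        ; don't-care garbage picked up from the unpack partner.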
        punpckhbw   xmm5,                   xmm2              ; axbxcxdx
        punpcklbw   xmm2,                   xmm2              ; exfxgxhx

        psraw       xmm5,                   11                ; sign extended shift right by 3
        psraw       xmm2,                   11                ; sign extended shift right by 3
        packsswb    xmm2,                   xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;

        punpcklbw   xmm0,                   xmm1              ; exfxgxhx
        punpckhbw   xmm1,                   xmm1              ; axbxcxdx

        psraw       xmm0,                   11                ; sign extended shift right by 3
        psraw       xmm1,                   11                ; sign extended shift right by 3

        movdqa      xmm5,                   xmm0              ; save results
        packsswb    xmm0,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3

        paddsw      xmm5,                   [ones GLOBAL]
        paddsw      xmm1,                   [ones GLOBAL]

        psraw       xmm5,                   1                 ; partial shifted one more time for 2nd tap
        psraw       xmm1,                   1                 ; partial shifted one more time for 2nd tap

        packsswb    xmm5,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
        pandn       xmm4,                   xmm5              ; high edge variance additive
%endmacro

%macro BH_WRITEBACK 1
        paddsb      xmm6,                   xmm2              ; p0+= p0 add
        pxor        xmm6,                   [t80 GLOBAL]      ; unoffset
%if %1
        movdqa      [rsi+rax],              xmm6              ; write back
%else
        lea         rsi,                    [rsi + rcx*2]
        lea         rdi,                    [rdi + rcx*2]
        movq        MMWORD PTR [rsi],       xmm6              ; p0
        psrldq      xmm6,                   8
        movq        MMWORD PTR [rdi],       xmm6
%endif

%if %1
        movdqa      xmm6,                   [rsi+2*rax]       ; p1
%else
        movdqa      xmm6,                   p1                ; p1
%endif
        pxor        xmm6,                   [t80 GLOBAL]      ; reoffset
        paddsb      xmm6,                   xmm4              ; p1+= p1 add
        pxor        xmm6,                   [t80 GLOBAL]      ; unoffset
%if %1
        movdqa      [rsi+2*rax],            xmm6              ; write back
%else
        movq        MMWORD PTR [rsi + rax], xmm6              ; p1
        psrldq      xmm6,                   8
        movq        MMWORD PTR [rdi + rax], xmm6
%endif

        psubsb      xmm3,                   xmm0              ; q0-= q0 add
        pxor        xmm3,                   [t80 GLOBAL]      ; unoffset
%if %1
        movdqa      [rsi],                  xmm3              ; write back
%else
        movq        MMWORD PTR [rsi + rcx], xmm3              ; q0
        psrldq      xmm3,                   8
        movq        MMWORD PTR [rdi + rcx], xmm3
%endif

        psubsb      xmm7,                   xmm4              ; q1-= q1 add
        pxor        xmm7,                   [t80 GLOBAL]      ; unoffset
%if %1
        movdqa      [rdi],                  xmm7              ; write back
%else
        movq        MMWORD PTR [rsi + rcx*2],xmm7             ; q1
        psrldq      xmm7,                   8
        movq        MMWORD PTR [rdi + rcx*2],xmm7
%endif
%endmacro


;void vp8_loop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_horizontal_edge_sse2)
sym(vp8_loop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

        mov         rsi,                    arg(0)           ; src_ptr
        movsxd      rax,                    dword ptr arg(1) ; src_pixel_step

        mov         rdx,                    arg(3)           ; limit
        movdqa      xmm7,                   XMMWORD PTR [rdx]

        lea         rdi,                    [rsi+rax]        ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions
        LFH_FILTER_MASK 1

        ; calculate high edge variance
        LFH_HEV_MASK

        ; start work on filters
        BH_FILTER 1
        ; write back the result
        BH_WRITEBACK 1

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_loop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_horizontal_edge_uv_sse2)
sym(vp8_loop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96       ; reserve 96 bytes
    %define q2  [rsp + 0]     ;__declspec(align(16)) char q2[16];
    %define q1  [rsp + 16]    ;__declspec(align(16)) char q1[16];
    %define p2  [rsp + 32]    ;__declspec(align(16)) char p2[16];
    %define p1  [rsp + 48]    ;__declspec(align(16)) char p1[16];
    %define t0  [rsp + 64]    ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 80]    ;__declspec(align(16)) char t1[16];

        mov         rsi,                    arg(0)             ; u
        mov         rdi,                    arg(5)             ; v
        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
        mov         rcx,                    rax
        neg         rax                     ; negate pitch to deal with above border

        mov         rdx,                    arg(3)             ; limit
        movdqa      xmm7,                   XMMWORD PTR [rdx]

        lea         rsi,                    [rsi + rcx]
        lea         rdi,                    [rdi + rcx]

        ; calculate breakout conditions
        LFH_FILTER_MASK 0
        ; calculate high edge variance
        LFH_HEV_MASK

        ; start work on filters
        BH_FILTER 0
        ; write back the result
        BH_WRITEBACK 0

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


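; MBH_FILTER is the macroblock-edge variant of BH_FILTER.  A C-style sketch
; (signed space, illustrative names):
;
;   a       = clamp(ps1 - qs1);              /* note: not masked by hev */
;   a       = clamp(a + 3 * (qs0 - ps0)) & mask;
;   Filter2 = a & hev;
;   Filter1 = clamp(Filter2 + 4) >> 3;       qs0 -= Filter1;
;   Filter2 = clamp(Filter2 + 3) >> 3;       ps0 += Filter2;
;   a      &= ~hev;                          /* remainder for the wide taps */
;
; MBH_WRITEBACK then spreads that remainder across three pixel pairs using
; the s27/s18/s9 multipliers (roughly 3/7, 2/7 and 1/7 of the difference).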
%macro MBH_FILTER 1
%if %1
        movdqa      xmm2,                   [rsi+2*rax]       ; p1
        movdqa      xmm7,                   [rdi]             ; q1
%else
        movdqa      xmm2,                   p1                ; p1
        movdqa      xmm7,                   q1                ; q1
%endif
        pxor        xmm2,                   [t80 GLOBAL]      ; p1 offset to convert to signed values
        pxor        xmm7,                   [t80 GLOBAL]      ; q1 offset to convert to signed values

        psubsb      xmm2,                   xmm7              ; p1 - q1
        pxor        xmm6,                   [t80 GLOBAL]      ; offset to convert to signed values
        pxor        xmm0,                   [t80 GLOBAL]      ; offset to convert to signed values
        movdqa      xmm3,                   xmm0              ; q0
        psubsb      xmm0,                   xmm6              ; q0 - p0
        paddsb      xmm2,                   xmm0              ; 1 * (q0 - p0) + (p1 - q1)
        paddsb      xmm2,                   xmm0              ; 2 * (q0 - p0) + (p1 - q1)
        paddsb      xmm2,                   xmm0              ; 3 * (q0 - p0) + (p1 - q1)

        pand        xmm1,                   xmm2              ; mask filter values we don't care about
        movdqa      xmm2,                   xmm1              ; vp8_filter
        pand        xmm2,                   xmm4              ; Filter2 = vp8_filter & hev

        movdqa      xmm5,                   xmm2
        paddsb      xmm5,                   [t3 GLOBAL]       ; vp8_signed_char_clamp(Filter2 + 3)

        punpckhbw   xmm7,                   xmm5              ; axbxcxdx
        punpcklbw   xmm5,                   xmm5              ; exfxgxhx

        psraw       xmm7,                   11                ; sign extended shift right by 3
        psraw       xmm5,                   11                ; sign extended shift right by 3

        packsswb    xmm5,                   xmm7              ; Filter2 >>= 3;
        paddsb      xmm2,                   [t4 GLOBAL]       ; Filter1 = vp8_signed_char_clamp(Filter2 + 4)

        punpckhbw   xmm7,                   xmm2              ; axbxcxdx
        punpcklbw   xmm0,                   xmm2              ; exfxgxhx

        psraw       xmm7,                   11                ; sign extended shift right by 3
        psraw       xmm0,                   11                ; sign extended shift right by 3

        packsswb    xmm0,                   xmm7              ; Filter1 >>= 3;
        paddsb      xmm6,                   xmm5              ; ps0 = ps0 + Filter2

        psubsb      xmm3,                   xmm0              ; qs0 = qs0 - Filter1
        pandn       xmm4,                   xmm1              ; vp8_filter &= ~hev
%endmacro

%macro MBH_WRITEBACK 1
        ; roughly 3/7th difference across boundary
        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
        ; s = vp8_signed_char_clamp(qs0 - u);
        ; *oq0 = s^0x80;
        ; s = vp8_signed_char_clamp(ps0 + u);
        ; *op0 = s^0x80;
        pxor        xmm1,                   xmm1

        pxor        xmm2,                   xmm2
        punpcklbw   xmm1,                   xmm4

        punpckhbw   xmm2,                   xmm4
        pmulhw      xmm1,                   [s27 GLOBAL]

        pmulhw      xmm2,                   [s27 GLOBAL]
        paddw       xmm1,                   [s63 GLOBAL]

        paddw       xmm2,                   [s63 GLOBAL]
        psraw       xmm1,                   7

        psraw       xmm2,                   7
        packsswb    xmm1,                   xmm2

        psubsb      xmm3,                   xmm1
        paddsb      xmm6,                   xmm1

        pxor        xmm3,                   [t80 GLOBAL]
        pxor        xmm6,                   [t80 GLOBAL]

%if %1
        movdqa      XMMWORD PTR [rsi+rax],  xmm6
        movdqa      XMMWORD PTR [rsi],      xmm3
%else
        lea         rsi,                    [rsi + rcx*2]
        lea         rdi,                    [rdi + rcx*2]

        movq        MMWORD PTR [rsi],       xmm6              ; p0
        psrldq      xmm6,                   8
        movq        MMWORD PTR [rdi],       xmm6
        movq        MMWORD PTR [rsi + rcx], xmm3              ; q0
        psrldq      xmm3,                   8
        movq        MMWORD PTR [rdi + rcx], xmm3
%endif

        ; roughly 2/7th difference across boundary
        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
        ; s = vp8_signed_char_clamp(qs1 - u);
        ; *oq1 = s^0x80;
        ; s = vp8_signed_char_clamp(ps1 + u);
        ; *op1 = s^0x80;
        pxor        xmm1,                   xmm1
        pxor        xmm2,                   xmm2

        punpcklbw   xmm1,                   xmm4
        punpckhbw   xmm2,                   xmm4

        pmulhw      xmm1,                   [s18 GLOBAL]
        pmulhw      xmm2,                   [s18 GLOBAL]

        paddw       xmm1,                   [s63 GLOBAL]
        paddw       xmm2,                   [s63 GLOBAL]

        psraw       xmm1,                   7
        psraw       xmm2,                   7

        packsswb    xmm1,                   xmm2

%if %1
        movdqa      xmm3,                   XMMWORD PTR [rdi]
        movdqa      xmm6,                   XMMWORD PTR [rsi+rax*2] ; p1
%else
        movdqa      xmm3,                   q1                ; q1
        movdqa      xmm6,                   p1                ; p1
%endif

        pxor        xmm3,                   [t80 GLOBAL]
        pxor        xmm6,                   [t80 GLOBAL]

        paddsb      xmm6,                   xmm1
        psubsb      xmm3,                   xmm1

        pxor        xmm6,                   [t80 GLOBAL]
        pxor        xmm3,                   [t80 GLOBAL]

%if %1
        movdqa      XMMWORD PTR [rdi],      xmm3
        movdqa      XMMWORD PTR [rsi+rax*2],xmm6
%else
        movq        MMWORD PTR [rsi + rcx*2],xmm3             ; q1
        psrldq      xmm3,                   8
        movq        MMWORD PTR [rdi + rcx*2],xmm3

        movq        MMWORD PTR [rsi + rax], xmm6              ; p1
        psrldq      xmm6,                   8
        movq        MMWORD PTR [rdi + rax], xmm6
%endif
        ; roughly 1/7th difference across boundary
        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
        ; s = vp8_signed_char_clamp(qs2 - u);
        ; *oq2 = s^0x80;
        ; s = vp8_signed_char_clamp(ps2 + u);
        ; *op2 = s^0x80;
        pxor        xmm1,                   xmm1
        pxor        xmm2,                   xmm2

        punpcklbw   xmm1,                   xmm4
        punpckhbw   xmm2,                   xmm4

        pmulhw      xmm1,                   [s9 GLOBAL]
        pmulhw      xmm2,                   [s9 GLOBAL]

        paddw       xmm1,                   [s63 GLOBAL]
        paddw       xmm2,                   [s63 GLOBAL]

        psraw       xmm1,                   7
        psraw       xmm2,                   7

        packsswb    xmm1,                   xmm2

%if %1
        movdqa      xmm6,                   XMMWORD PTR [rdi+rax*4]
        neg         rax

        movdqa      xmm3,                   XMMWORD PTR [rdi+rax]
%else
        movdqa      xmm6,                   p2                ; p2
        movdqa      xmm3,                   q2                ; q2
%endif

        pxor        xmm6,                   [t80 GLOBAL]
        pxor        xmm3,                   [t80 GLOBAL]

        paddsb      xmm6,                   xmm1
        psubsb      xmm3,                   xmm1

        pxor        xmm6,                   [t80 GLOBAL]
        pxor        xmm3,                   [t80 GLOBAL]
%if %1
        movdqa      XMMWORD PTR [rdi+rax  ],xmm3
        neg         rax

        movdqa      XMMWORD PTR [rdi+rax*4],xmm6
%else
        movq        MMWORD PTR [rsi+rax*2], xmm6              ; p2
        psrldq      xmm6,                   8
        movq        MMWORD PTR [rdi+rax*2], xmm6

        lea         rsi,                    [rsi + rcx]
        lea         rdi,                    [rdi + rcx]
        movq        MMWORD PTR [rsi+rcx*2  ],xmm3             ; q2
        psrldq      xmm3,                   8
        movq        MMWORD PTR [rdi+rcx*2  ],xmm3
%endif
%endmacro


;void vp8_mbloop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_mbloop_filter_horizontal_edge_sse2)
sym(vp8_mbloop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

        mov         rsi,                    arg(0)            ; src_ptr
        movsxd      rax,                    dword ptr arg(1)  ; src_pixel_step

        mov         rdx,                    arg(3)            ; limit
        movdqa      xmm7,                   XMMWORD PTR [rdx]

        lea         rdi,                    [rsi+rax]         ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions
        LFH_FILTER_MASK 1

        ; calculate high edge variance
        LFH_HEV_MASK

        ; start work on filters
        MBH_FILTER 1
        ; write back the result
        MBH_WRITEBACK 1

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_mbloop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2)
sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96       ; reserve 96 bytes
    %define q2  [rsp + 0]     ;__declspec(align(16)) char q2[16];
    %define q1  [rsp + 16]    ;__declspec(align(16)) char q1[16];
    %define p2  [rsp + 32]    ;__declspec(align(16)) char p2[16];
    %define p1  [rsp + 48]    ;__declspec(align(16)) char p1[16];
    %define t0  [rsp + 64]    ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 80]    ;__declspec(align(16)) char t1[16];

        mov         rsi,                    arg(0)             ; u
        mov         rdi,                    arg(5)             ; v
        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
        mov         rcx,                    rax
        neg         rax                     ; negate pitch to deal with above border

        mov         rdx,                    arg(3)             ; limit
        movdqa      xmm7,                   XMMWORD PTR [rdx]

        lea         rsi,                    [rsi + rcx]
        lea         rdi,                    [rdi + rcx]

        ; calculate breakout conditions
        LFH_FILTER_MASK 0

        ; calculate high edge variance
        LFH_HEV_MASK

        ; start work on filters
        MBH_FILTER 0
        ; write back the result
        MBH_WRITEBACK 0

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


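; The vertical-edge filters reuse the horizontal kernels by transposing a
; 16-row x 8-column block around the edge into eight 16-byte column vectors.
; TRANSPOSE_16X8_1 interleaves rows 0-7 with punpcklbw/punpck?wd/punpck?dq;
; TRANSPOSE_16X8_2 does rows 8-15 and merges the halves with punpck?qdq,
; spilling columns to srct (columns 2-5 when %1 == 1, all eight when
; %1 == 0).  The lane comments name each byte as <row><column> in hex.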
%macro TRANSPOSE_16X8_1 0
        movq        xmm4,               QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
        movq        xmm7,               QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10

        punpcklbw   xmm4,               xmm7            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
        movq        xmm0,               QWORD PTR [rsi+2*rax]  ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20

        movdqa      xmm3,               xmm4            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00

        movq        xmm7,               QWORD PTR [rdi+2*rax]  ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
        punpcklbw   xmm0,               xmm7            ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20

        movq        xmm5,               QWORD PTR [rsi+4*rax]  ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
        movq        xmm2,               QWORD PTR [rdi+4*rax]  ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50

        punpcklbw   xmm5,               xmm2            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
        movq        xmm7,               QWORD PTR [rsi+2*rcx]  ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60

        movq        xmm1,               QWORD PTR [rdi+2*rcx]  ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
        movdqa      xmm6,               xmm5            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40

        punpcklbw   xmm7,               xmm1            ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
        punpcklwd   xmm5,               xmm7            ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

        punpckhwd   xmm6,               xmm7            ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44

        punpcklwd   xmm3,               xmm0            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm4,               xmm0            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04

        movdqa      xmm7,               xmm4            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
        movdqa      xmm2,               xmm3            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00

        punpckhdq   xmm7,               xmm6            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
        punpckldq   xmm4,               xmm6            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04

        punpckhdq   xmm3,               xmm5            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpckldq   xmm2,               xmm5            ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00

        movdqa      t0,                 xmm2            ; save to free XMM2
%endmacro

%macro TRANSPOSE_16X8_2 1
        movq        xmm2,               QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
        movq        xmm5,               QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90

        punpcklbw   xmm2,               xmm5            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
        movq        xmm0,               QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0

        movq        xmm5,               QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
        punpcklbw   xmm0,               xmm5                  ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0

        movq        xmm1,               QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
        movq        xmm6,               QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0

        punpcklbw   xmm1,               xmm6            ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
        movq        xmm5,               QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0

        movq        xmm6,               QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
        punpcklbw   xmm5,               xmm6            ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0

        movdqa      xmm6,               xmm1
        punpckhwd   xmm6,               xmm5            ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4

        punpcklwd   xmm1,               xmm5            ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
        movdqa      xmm5,               xmm2            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        punpcklwd   xmm5,               xmm0            ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
        punpckhwd   xmm2,               xmm0            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        movdqa      xmm0,               xmm5
        punpckldq   xmm0,               xmm1            ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80


        punpckhdq   xmm5,               xmm1            ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
        movdqa      xmm1,               xmm2            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        punpckldq   xmm1,               xmm6            ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
        punpckhdq   xmm2,               xmm6            ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86

        movdqa      xmm6,               xmm7            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
        punpcklqdq  xmm6,               xmm2            ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06

        punpckhqdq  xmm7,               xmm2            ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
%if %1
        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        movdqa      [rdx],              xmm2            ; save 2

        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+16],           xmm3            ; save 3
        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+32],           xmm4            ; save 4
        movdqa      [rdx+48],           xmm5            ; save 5

        movdqa      xmm1,               t0              ; get columns 1/0 of rows 0-7 saved by TRANSPOSE_16X8_1
        movdqa      xmm2,               xmm1

        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
%else
        movdqa      [rdx+112],          xmm7            ; save 7
        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        movdqa      [rdx+96],           xmm6            ; save 6
        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        movdqa      [rdx+32],           xmm2            ; save 2

        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+48],           xmm3            ; save 3
        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+64],           xmm4            ; save 4
        movdqa      [rdx+80],           xmm5            ; save 5

        movdqa      xmm1,               t0              ; get columns 1/0 of rows 0-7 saved by TRANSPOSE_16X8_1
        movdqa      xmm2,               xmm1

        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        movdqa      [rdx+16],           xmm1
        movdqa      [rdx],              xmm2
%endif
%endmacro

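; LFV_FILTER_MASK is the vertical twin of LFH_FILTER_MASK: the same breakout
; test, applied to the transposed columns.  On entry (as set up by the
; transpose macros): xmm1 = p2, xmm2 = p3, xmm3 = p0, xmm5 = q1, xmm6 = q2,
; xmm7 = q3; p1, q0 and q1 are (re)read from srct, whose layout differs
; between the %1 == 1 and %1 == 0 callers.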
%macro LFV_FILTER_MASK 1
        movdqa      xmm0,               xmm6            ; q2
        psubusb     xmm0,               xmm7            ; q2-q3

        psubusb     xmm7,               xmm6            ; q3-q2
        por         xmm7,               xmm0            ; abs (q3-q2)

        movdqa      xmm4,               xmm5            ; q1
        psubusb     xmm4,               xmm6            ; q1-q2

        psubusb     xmm6,               xmm5            ; q2-q1
        por         xmm6,               xmm4            ; abs (q2-q1)

        movdqa      xmm0,               xmm1

        psubusb     xmm0,               xmm2            ; p2 - p3
        psubusb     xmm2,               xmm1            ; p3 - p2

        por         xmm0,               xmm2            ; abs(p2-p3)
%if %1
        movdqa      xmm2,               [rdx]           ; p1
%else
        movdqa      xmm2,               [rdx+32]        ; p1
%endif
        movdqa      xmm5,               xmm2            ; p1

        psubusb     xmm5,               xmm1            ; p1-p2
        psubusb     xmm1,               xmm2            ; p2-p1

        por         xmm1,               xmm5            ; abs(p2-p1)

        mov         rdx,                arg(3)          ; limit
        movdqa      xmm4,               [rdx]           ; limit

        psubusb     xmm7,               xmm4            ; abs(q3-q2) > limit

        psubusb     xmm0,               xmm4            ; abs(p3-p2) > limit
        psubusb     xmm1,               xmm4            ; abs(p2-p1) > limit

        psubusb     xmm6,               xmm4            ; abs(q2-q1) > limit
        por         xmm7,               xmm6            ; or

        por         xmm0,               xmm1
        por         xmm0,               xmm7            ; abs(q3-q2) > limit || abs(p3-p2) > limit || abs(p2-p1) > limit || abs(q2-q1) > limit

        movdqa      xmm1,               xmm2            ; p1

        movdqa      xmm7,               xmm3            ; p0
        psubusb     xmm7,               xmm2            ; p0-p1

        psubusb     xmm2,               xmm3            ; p1-p0
        por         xmm2,               xmm7            ; abs(p1-p0)

        movdqa      t0,                 xmm2            ; save abs(p1-p0)
        lea         rdx,                srct

        psubusb     xmm2,               xmm4            ; abs(p1-p0) > limit
        por         xmm0,               xmm2            ; mask
%if %1
        movdqa      xmm5,               [rdx+32]        ; q0
        movdqa      xmm7,               [rdx+48]        ; q1
%else
        movdqa      xmm5,               [rdx+64]        ; q0
        movdqa      xmm7,               [rdx+80]        ; q1
%endif
        movdqa      xmm6,               xmm5            ; q0
        movdqa      xmm2,               xmm7            ; q1
        psubusb     xmm5,               xmm7            ; q0-q1

        psubusb     xmm7,               xmm6            ; q1-q0
        por         xmm7,               xmm5            ; abs(q1-q0)

        movdqa      t1,                 xmm7            ; save abs(q1-q0)
        psubusb     xmm7,               xmm4            ; abs(q1-q0) > limit

        por         xmm0,               xmm7            ; mask

        movdqa      xmm5,               xmm2            ; q1
        psubusb     xmm5,               xmm1            ; q1-=p1
        psubusb     xmm1,               xmm2            ; p1-=q1
        por         xmm5,               xmm1            ; abs(p1-q1)
        pand        xmm5,               [tfe GLOBAL]    ; set lsb of each byte to zero
        psrlw       xmm5,               1               ; abs(p1-q1)/2

        mov         rdx,                arg(2)          ; flimit
        movdqa      xmm2,               [rdx]           ; flimit

        movdqa      xmm1,               xmm3            ; p0
        movdqa      xmm7,               xmm6            ; q0
        psubusb     xmm1,               xmm7            ; p0-q0
        psubusb     xmm7,               xmm3            ; q0-p0
        por         xmm1,               xmm7            ; abs(q0-p0)
        paddusb     xmm1,               xmm1            ; abs(q0-p0)*2
        paddusb     xmm1,               xmm5            ; abs(p0 - q0)*2 + abs(p1-q1)/2

        paddb       xmm2,               xmm2            ; flimit*2 (less than 255)
        paddb       xmm4,               xmm2            ; flimit*2 + limit (less than 255)

        psubusb     xmm1,               xmm4            ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit
        por         xmm1,               xmm0            ; mask
        pxor        xmm0,               xmm0
        pcmpeqb     xmm1,               xmm0
%endmacro

%macro LFV_HEV_MASK 0
        mov         rdx,                arg(4)          ; get thresh
        movdqa      xmm7,               XMMWORD PTR [rdx]

        movdqa      xmm4,               t0              ; get abs (p1 - p0)
        psubusb     xmm4,               xmm7            ; abs(p1 - p0) > thresh

        movdqa      xmm3,               t1              ; get abs (q1 - q0)
        psubusb     xmm3,               xmm7            ; abs(q1 - q0) > thresh

        por         xmm4,               xmm3            ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
        pcmpeqb     xmm4,               xmm0

        pcmpeqb     xmm0,               xmm0
        pxor        xmm4,               xmm0
%endmacro

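; BV_FILTER is BH_FILTER applied to the transposed columns: p1/p0/q0/q1 come
; from srct, the same 4-tap math runs, and the filtered, unoffset results are
; left in xmm1 (p1), xmm6 (p0), xmm3 (q0) and xmm7 (q1) for BV_TRANSPOSE.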
%macro BV_FILTER 0
        lea         rdx,                srct

        movdqa      xmm2,               [rdx]           ; p1
        movdqa      xmm7,               [rdx+48]        ; q1
        movdqa      xmm6,               [rdx+16]        ; p0
        movdqa      xmm0,               [rdx+32]        ; q0

        pxor        xmm2,               [t80 GLOBAL]    ; p1 offset to convert to signed values
        pxor        xmm7,               [t80 GLOBAL]    ; q1 offset to convert to signed values

        psubsb      xmm2,               xmm7            ; p1 - q1
        pand        xmm2,               xmm4            ; high var mask (hvm)(p1 - q1)

        pxor        xmm6,               [t80 GLOBAL]    ; offset to convert to signed values
        pxor        xmm0,               [t80 GLOBAL]    ; offset to convert to signed values

        movdqa      xmm3,               xmm0            ; q0
        psubsb      xmm0,               xmm6            ; q0 - p0

        paddsb      xmm2,               xmm0            ; 1 * (q0 - p0) + hvm(p1 - q1)
        paddsb      xmm2,               xmm0            ; 2 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2,               xmm0            ; 3 * (q0 - p0) + hvm(p1 - q1)
        pand        xmm1,               xmm2            ; mask filter values we don't care about

        movdqa      xmm2,               xmm1
        paddsb      xmm1,               [t4 GLOBAL]     ; 3* (q0 - p0) + hvm(p1 - q1) + 4

        paddsb      xmm2,               [t3 GLOBAL]     ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        punpckhbw   xmm5,               xmm2
        punpcklbw   xmm2,               xmm2

        psraw       xmm5,               11
        psraw       xmm2,               11

        packsswb    xmm2,               xmm5            ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
        punpcklbw   xmm0,               xmm1            ; exfxgxhx

        punpckhbw   xmm1,               xmm1            ; axbxcxdx
        psraw       xmm0,               11              ; sign extended shift right by 3

        psraw       xmm1,               11              ; sign extended shift right by 3
        movdqa      xmm5,               xmm0            ; save results

        packsswb    xmm0,               xmm1            ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      xmm5,               [ones GLOBAL]

        paddsw      xmm1,               [ones GLOBAL]
        psraw       xmm5,               1               ; partial shifted one more time for 2nd tap

        psraw       xmm1,               1               ; partial shifted one more time for 2nd tap
        packsswb    xmm5,               xmm1            ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4

        pandn       xmm4,               xmm5            ; high edge variance additive

        paddsb      xmm6,               xmm2            ; p0+= p0 add
        pxor        xmm6,               [t80 GLOBAL]    ; unoffset

        movdqa      xmm1,               [rdx]           ; p1
        pxor        xmm1,               [t80 GLOBAL]    ; reoffset

        paddsb      xmm1,               xmm4            ; p1+= p1 add
        pxor        xmm1,               [t80 GLOBAL]    ; unoffset

        psubsb      xmm3,               xmm0            ; q0-= q0 add
        pxor        xmm3,               [t80 GLOBAL]    ; unoffset

        psubsb      xmm7,               xmm4            ; q1-= q1 add
        pxor        xmm7,               [t80 GLOBAL]    ; unoffset
%endmacro

%macro BV_TRANSPOSE 0
        ; xmm1 =    f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; xmm6 =    f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        ; xmm3 =    f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        ; xmm7 =    f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
        movdqa      xmm2,               xmm1            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpcklbw   xmm2,               xmm6            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02

        movdqa      xmm4,               xmm3            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm1,               xmm6            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklbw   xmm4,               xmm7            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
        punpckhbw   xmm3,               xmm7            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        movdqa      xmm6,               xmm2            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpcklwd   xmm2,               xmm4            ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02

        punpckhwd   xmm6,               xmm4            ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        movdqa      xmm5,               xmm1            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklwd   xmm1,               xmm3            ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
        punpckhwd   xmm5,               xmm3            ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
%endmacro

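; BV_WRITEBACK scatters the re-transposed pixels back into the image.  Each
; movd stores one row's four filtered bytes (p1 p0 q0 q1); because rsi was
; pre-decremented by 4 before the load transpose, those bytes belong at
; column offset +2 of each row.  %1 carries four rows, %2 the next four.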
   1092 %macro BV_WRITEBACK 2
   1093         movd        [rsi+2],            %1
   1094         psrldq      %1,                 4
   1095 
   1096         movd        [rdi+2],            %1
   1097         psrldq      %1,                 4
   1098 
   1099         movd        [rsi+2*rax+2],      %1
   1100         psrldq      %1,                 4
   1101 
   1102         movd        [rdi+2*rax+2],      %1
   1103 
   1104         movd        [rsi+4*rax+2],      %2
   1105         psrldq      %2,                 4
   1106 
   1107         movd        [rdi+4*rax+2],      %2
   1108         psrldq      %2,                 4
   1109 
   1110         movd        [rsi+2*rcx+2],      %2
   1111         psrldq      %2,                 4
   1112 
   1113         movd        [rdi+2*rcx+2],      %2
   1114 %endmacro
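
; each movd above stores one transposed 4-byte column at x-offset +2: rsi
; was pre-decremented by 4 columns, so offset 2 lands exactly on p1 p0 q0 q1
; and the unfiltered p3 p2 / q2 q3 bytes of each row are left untouched.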
   1115 
   1116 
   1117 ;void vp8_loop_filter_vertical_edge_sse2
   1118 ;(
   1119 ;    unsigned char *src_ptr,
   1120 ;    int            src_pixel_step,
   1121 ;    const char    *flimit,
   1122 ;    const char    *limit,
   1123 ;    const char    *thresh,
   1124 ;    int            count
   1125 ;)
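;
; outline (for reference): the vertical edge is filtered by transposing a
; 16x8 block so the edge becomes horizontal, running the same mask/filter
; steps as the row filter, and transposing the modified lines back:
;     TRANSPOSE_16X8_1/2 -> LFV_FILTER_MASK -> LFV_HEV_MASK -> BV_FILTER
;     -> BV_TRANSPOSE -> BV_WRITEBACK (stores only p1..q1 per row)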
   1126 global sym(vp8_loop_filter_vertical_edge_sse2)
   1127 sym(vp8_loop_filter_vertical_edge_sse2):
   1128     push        rbp
   1129     mov         rbp, rsp
   1130     SHADOW_ARGS_TO_STACK 6
   1131     SAVE_XMM
   1132     GET_GOT     rbx
   1133     push        rsi
   1134     push        rdi
   1135     ; end prolog
   1136 
   1137     ALIGN_STACK 16, rax
   1138     sub             rsp, 96      ; reserve 96 bytes
   1139     %define t0      [rsp + 0]    ;__declspec(align(16)) char t0[16];
   1140     %define t1      [rsp + 16]   ;__declspec(align(16)) char t1[16];
   1141     %define srct    [rsp + 32]   ;__declspec(align(16)) char srct[64];
   1142 
   1143         mov         rsi,        arg(0)                  ; src_ptr
   1144         movsxd      rax,        dword ptr arg(1)        ; src_pixel_step
   1145 
   1146         lea         rsi,        [rsi - 4]
   1147         lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
   1148         lea         rcx,        [rax*2+rax]
   1149 
   1150         ;transpose 16x8 to 8x16, and store the 8-line result on stack.
   1151         TRANSPOSE_16X8_1
   1152 
   1153         lea         rsi,        [rsi+rax*8]
   1154         lea         rdi,        [rdi+rax*8]
   1155         lea         rdx,        srct
   1156         TRANSPOSE_16X8_2 1
   1157 
   1158         ; calculate filter mask
   1159         LFV_FILTER_MASK 1
   1160         ; calculate high edge variance
   1161         LFV_HEV_MASK
   1162 
   1163         ; start work on filters
   1164         BV_FILTER
   1165 
        ; transpose and write back - only works on q1, q0, p0, p1
   1167         BV_TRANSPOSE
   1168         ; store 16-line result
   1169 
   1170         lea         rdx,        [rax]
   1171         neg         rdx
   1172 
   1173         BV_WRITEBACK xmm1, xmm5
   1174 
   1175         lea         rsi,        [rsi+rdx*8]
   1176         lea         rdi,        [rdi+rdx*8]
   1177         BV_WRITEBACK xmm2, xmm6
   1178 
   1179     add rsp, 96
   1180     pop rsp
   1181     ; begin epilog
   1182     pop rdi
   1183     pop rsi
   1184     RESTORE_GOT
   1185     RESTORE_XMM
   1186     UNSHADOW_ARGS
   1187     pop         rbp
   1188     ret
   1189 
   1190 
   1191 ;void vp8_loop_filter_vertical_edge_uv_sse2
   1192 ;(
   1193 ;    unsigned char *u,
   1194 ;    int            src_pixel_step,
   1195 ;    const char    *flimit,
   1196 ;    const char    *limit,
   1197 ;    const char    *thresh,
   1198 ;    unsigned char *v
   1199 ;)
   1200 global sym(vp8_loop_filter_vertical_edge_uv_sse2)
   1201 sym(vp8_loop_filter_vertical_edge_uv_sse2):
   1202     push        rbp
   1203     mov         rbp, rsp
   1204     SHADOW_ARGS_TO_STACK 6
   1205     SAVE_XMM
   1206     GET_GOT     rbx
   1207     push        rsi
   1208     push        rdi
   1209     ; end prolog
   1210 
   1211     ALIGN_STACK 16, rax
   1212     sub             rsp, 96      ; reserve 96 bytes
   1213     %define t0      [rsp + 0]    ;__declspec(align(16)) char t0[16];
   1214     %define t1      [rsp + 16]   ;__declspec(align(16)) char t1[16];
   1215     %define srct    [rsp + 32]   ;__declspec(align(16)) char srct[64];
   1216 
   1217         mov         rsi,        arg(0)                  ; u_ptr
   1218         movsxd      rax,        dword ptr arg(1)        ; src_pixel_step
   1219 
   1220         lea         rsi,        [rsi - 4]
   1221         lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
   1222         lea         rcx,        [rax+2*rax]
   1223 
   1224         ;transpose 16x8 to 8x16, and store the 8-line result on stack.
   1225         TRANSPOSE_16X8_1
   1226 
   1227         mov         rsi,        arg(5)                  ; v_ptr
   1228         lea         rsi,        [rsi - 4]
   1229         lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
   1230 
   1231         lea         rdx,        srct
   1232         TRANSPOSE_16X8_2 1
   1233 
   1234         ; calculate filter mask
   1235         LFV_FILTER_MASK 1
   1236         ; calculate high edge variance
   1237         LFV_HEV_MASK
   1238 
   1239         ; start work on filters
   1240         BV_FILTER
   1241 
        ; transpose and write back - only works on q1, q0, p0, p1
   1243         BV_TRANSPOSE
   1244 
   1245         lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
   1246 
   1247         ; store 16-line result
   1248         BV_WRITEBACK xmm1, xmm5
   1249 
   1250         mov         rsi,        arg(0)                  ; u_ptr
   1251         lea         rsi,        [rsi - 4]
   1252         lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
   1253         BV_WRITEBACK xmm2, xmm6
   1254 
   1255     add rsp, 96
   1256     pop rsp
   1257     ; begin epilog
   1258     pop rdi
   1259     pop rsi
   1260     RESTORE_GOT
   1261     RESTORE_XMM
   1262     UNSHADOW_ARGS
   1263     pop         rbp
   1264     ret
   1265 
   1266 
   1267 %macro MBV_FILTER 0
   1268         lea         rdx,                srct
   1269 
   1270         movdqa      xmm2,               [rdx+32]        ; p1
   1271         movdqa      xmm7,               [rdx+80]        ; q1
   1272         movdqa      xmm6,               [rdx+48]        ; p0
   1273         movdqa      xmm0,               [rdx+64]        ; q0
   1274 
   1275         pxor        xmm2,               [t80 GLOBAL]    ; p1 offset to convert to signed values
   1276         pxor        xmm7,               [t80 GLOBAL]    ; q1 offset to convert to signed values
   1277         pxor        xmm6,               [t80 GLOBAL]    ; offset to convert to signed values
   1278         pxor        xmm0,               [t80 GLOBAL]    ; offset to convert to signed values
   1279 
   1280         psubsb      xmm2,               xmm7            ; p1 - q1
   1281 
   1282         movdqa      xmm3,               xmm0            ; q0
   1283 
   1284         psubsb      xmm0,               xmm6            ; q0 - p0
   1285         paddsb      xmm2,               xmm0            ; 1 * (q0 - p0) + (p1 - q1)
   1286 
   1287         paddsb      xmm2,               xmm0            ; 2 * (q0 - p0)
   1288         paddsb      xmm2,               xmm0            ; 3 * (q0 - p0)+ (p1 - q1)
   1289 
   1290         pand        xmm1,               xmm2            ; mask filter values we don't care about
   1291 
   1292         movdqa      xmm2,               xmm1            ; vp8_filter
        pand        xmm2,               xmm4            ; Filter2 = vp8_filter & hev
   1294 
        movdqa      xmm5,               xmm2
        paddsb      xmm5,               [t3 GLOBAL]     ; vp8_signed_char_clamp(Filter2 + 3)
   1297 
   1298         punpckhbw   xmm7,               xmm5            ; axbxcxdx
   1299         punpcklbw   xmm5,               xmm5            ; exfxgxhx
   1300 
   1301         psraw       xmm7,               11              ; sign extended shift right by 3
   1302         psraw       xmm5,               11              ; sign extended shift right by 3
   1303 
   1304         packsswb    xmm5,               xmm7            ; Filter2 >>=3;
   1305 
   1306         paddsb      xmm2,               [t4 GLOBAL]     ; vp8_signed_char_clamp(Filter2 + 4)
   1307 
   1308         punpcklbw   xmm0,               xmm2            ; exfxgxhx
   1309         punpckhbw   xmm7,               xmm2            ; axbxcxdx
   1310 
   1311         psraw       xmm0,               11              ; sign extended shift right by 3
   1312         psraw       xmm7,               11              ; sign extended shift right by 3
   1313 
        packsswb    xmm0,               xmm7            ; Filter1 >>= 3
   1315 
        psubsb      xmm3,               xmm0            ; qs0 = qs0 - Filter1
        paddsb      xmm6,               xmm5            ; ps0 = ps0 + Filter2
   1318 
   1319         ; vp8_filter &= ~hev;
   1320         ; Filter2 = vp8_filter;
   1321         pandn       xmm4,               xmm1            ; vp8_filter&=~hev
   1322 
   1323         ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
   1324         ; s = vp8_signed_char_clamp(qs0 - u);
   1325         ; *oq0 = s^0x80;
   1326         ; s = vp8_signed_char_clamp(ps0 + u);
   1327         ; *op0 = s^0x80;
   1328         pxor        xmm1,               xmm1
   1329 
   1330         pxor        xmm2,               xmm2
   1331         punpcklbw   xmm1,               xmm4
   1332 
   1333         punpckhbw   xmm2,               xmm4
   1334         pmulhw      xmm1,               [s27 GLOBAL]
   1335 
   1336         pmulhw      xmm2,               [s27 GLOBAL]
   1337         paddw       xmm1,               [s63 GLOBAL]
   1338 
   1339         paddw       xmm2,               [s63 GLOBAL]
   1340         psraw       xmm1,               7
   1341 
   1342         psraw       xmm2,               7
   1343         packsswb    xmm1,               xmm2
   1344 
   1345         psubsb      xmm3,               xmm1
   1346         paddsb      xmm6,               xmm1
   1347 
   1348         pxor        xmm3,               [t80 GLOBAL]
   1349         pxor        xmm6,               [t80 GLOBAL]
   1350 
   1351         movdqa      [rdx+48],           xmm6
   1352         movdqa      [rdx+64],           xmm3
   1353 
   1354         ; roughly 2/7th difference across boundary
   1355         ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
   1356         ; s = vp8_signed_char_clamp(qs1 - u);
   1357         ; *oq1 = s^0x80;
   1358         ; s = vp8_signed_char_clamp(ps1 + u);
   1359         ; *op1 = s^0x80;
   1360         pxor        xmm1,               xmm1
   1361         pxor        xmm2,               xmm2
   1362 
   1363         punpcklbw   xmm1,               xmm4
   1364         punpckhbw   xmm2,               xmm4
   1365 
   1366         pmulhw      xmm1,               [s18 GLOBAL]
   1367         pmulhw      xmm2,               [s18 GLOBAL]
   1368 
   1369         paddw       xmm1,               [s63 GLOBAL]
   1370         paddw       xmm2,               [s63 GLOBAL]
   1371 
   1372         psraw       xmm1,               7
   1373         psraw       xmm2,               7
   1374 
   1375         packsswb    xmm1,               xmm2
   1376 
   1377         movdqa      xmm3,               [rdx + 80]              ; q1
   1378         movdqa      xmm6,               [rdx + 32]              ; p1
   1379 
   1380         pxor        xmm3,               [t80 GLOBAL]
   1381         pxor        xmm6,               [t80 GLOBAL]
   1382 
   1383         paddsb      xmm6,               xmm1
   1384         psubsb      xmm3,               xmm1
   1385 
   1386         pxor        xmm6,               [t80 GLOBAL]
   1387         pxor        xmm3,               [t80 GLOBAL]
   1388 
   1389         movdqa      [rdx + 80],         xmm3
   1390         movdqa      [rdx + 32],         xmm6
   1391 
   1392         ; roughly 1/7th difference across boundary
   1393         ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
   1394         ; s = vp8_signed_char_clamp(qs2 - u);
   1395         ; *oq2 = s^0x80;
   1396         ; s = vp8_signed_char_clamp(ps2 + u);
   1397         ; *op2 = s^0x80;
   1398         pxor        xmm1,               xmm1
   1399         pxor        xmm2,               xmm2
   1400 
   1401         punpcklbw   xmm1,               xmm4
   1402         punpckhbw   xmm2,               xmm4
   1403 
   1404         pmulhw      xmm1,               [s9 GLOBAL]
   1405         pmulhw      xmm2,               [s9 GLOBAL]
   1406 
   1407         paddw       xmm1,               [s63 GLOBAL]
   1408         paddw       xmm2,               [s63 GLOBAL]
   1409 
   1410         psraw       xmm1,               7
   1411         psraw       xmm2,               7
   1412 
   1413         packsswb    xmm1,               xmm2
   1414 
   1415         movdqa      xmm6,               [rdx+16]
   1416         movdqa      xmm3,               [rdx+96]
   1417 
   1418         pxor        xmm6,               [t80 GLOBAL]
   1419         pxor        xmm3,               [t80 GLOBAL]
   1420 
   1421         paddsb      xmm6,               xmm1
   1422         psubsb      xmm3,               xmm1
   1423 
   1424         pxor        xmm6,               [t80 GLOBAL]        ; xmm6 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        pxor        xmm3,               [t80 GLOBAL]        ; xmm3 = f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
   1426 %endmacro
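
; the pmulhw constants are the mb-filter tap weights scaled by 256
; (s27 = 27<<8, s18 = 18<<8, s9 = 9<<8). punpck*bw against a zeroed
; register leaves each byte b in the high half of its word (value b*256),
; so pmulhw returns (b*256 * w*256) >> 16 = b*w exactly; in scalar form:
;     u = vp8_signed_char_clamp((63 + Filter2 * w) >> 7);   /* w = 27,18,9 */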
   1427 
   1428 %macro MBV_TRANSPOSE 0
   1429         movdqa      xmm0,               [rdx]               ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
   1430         movdqa      xmm1,               xmm0                ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
   1431 
   1432         punpcklbw   xmm0,               xmm6                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
   1433         punpckhbw   xmm1,               xmm6                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
   1434 
   1435         movdqa      xmm2,               [rdx+32]            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
   1436         movdqa      xmm6,               xmm2                ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
   1437 
   1438         punpcklbw   xmm2,               [rdx+48]            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
   1439         punpckhbw   xmm6,               [rdx+48]            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
   1440 
   1441         movdqa      xmm5,               xmm0                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
   1442         punpcklwd   xmm0,               xmm2                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
   1443 
   1444         punpckhwd   xmm5,               xmm2                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
   1445         movdqa      xmm4,               xmm1                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
   1446 
   1447         punpcklwd   xmm1,               xmm6                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
   1448         punpckhwd   xmm4,               xmm6                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
   1449 
   1450         movdqa      xmm2,               [rdx+64]            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
   1451         punpcklbw   xmm2,               [rdx+80]            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
   1452 
   1453         movdqa      xmm6,               xmm3                ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
   1454         punpcklbw   xmm6,               [rdx+112]           ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
   1455 
   1456         movdqa      xmm7,               xmm2                ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
   1457         punpcklwd   xmm2,               xmm6                ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04
   1458 
   1459         punpckhwd   xmm7,               xmm6                ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
   1460         movdqa      xmm6,               xmm0                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
   1461 
   1462         punpckldq   xmm0,               xmm2                ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
   1463         punpckhdq   xmm6,               xmm2                ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
   1464 %endmacro
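
; MBV_TRANSPOSE runs the byte/word/dword unpack ladder in the opposite
; direction of the load transpose: the eight filtered lines saved at srct
; are rebuilt into whole 8-byte pixel rows (xmm0/xmm6 hold rows 0-3 here;
; the writeback macros assemble and store the remaining rows).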
   1465 
   1466 %macro MBV_WRITEBACK_1 0
   1467         movq        QWORD PTR [rsi],    xmm0
   1468         psrldq      xmm0,               8
   1469 
   1470         movq        QWORD PTR [rdi],    xmm0
   1471 
   1472         movq        QWORD PTR [rsi+2*rax], xmm6
   1473         psrldq      xmm6,               8
   1474 
   1475         movq        QWORD PTR [rdi+2*rax], xmm6
   1476 
   1477         movdqa      xmm0,               xmm5                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
   1478         punpckldq   xmm0,               xmm7                ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
   1479 
   1480         punpckhdq   xmm5,               xmm7                ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
   1481 
   1482         movq        QWORD PTR [rsi+4*rax], xmm0
   1483         psrldq      xmm0,               8
   1484 
   1485         movq        QWORD PTR [rdi+4*rax], xmm0
   1486 
   1487         movq        QWORD PTR [rsi+2*rcx], xmm5
   1488         psrldq      xmm5,               8
   1489 
   1490         movq        QWORD PTR [rdi+2*rcx], xmm5
   1491 
   1492         movdqa      xmm2,               [rdx+64]            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
   1493         punpckhbw   xmm2,               [rdx+80]            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
   1494 
   1495         punpckhbw   xmm3,               [rdx+112]           ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
   1496         movdqa      xmm0,               xmm2
   1497 
        punpcklwd   xmm0,               xmm3                ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
   1499         punpckhwd   xmm2,               xmm3                ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
   1500 
   1501         movdqa      xmm3,               xmm1                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckldq   xmm1,               xmm0                ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80
   1503 
   1504         punpckhdq   xmm3,               xmm0                ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
   1505 %endmacro
   1506 
   1507 %macro MBV_WRITEBACK_2 0
   1508         movq        QWORD PTR [rsi], xmm1
   1509         psrldq      xmm1,               8
   1510 
   1511         movq        QWORD PTR [rdi], xmm1
   1512 
   1513         movq        QWORD PTR [rsi+2*rax], xmm3
   1514         psrldq      xmm3,               8
   1515 
   1516         movq        QWORD PTR [rdi+2*rax], xmm3
   1517 
   1518         movdqa      xmm1,               xmm4                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
   1519         punpckldq   xmm1,               xmm2                ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
   1520 
        punpckhdq   xmm4,               xmm2                ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
   1522         movq        QWORD PTR [rsi+4*rax], xmm1
   1523 
   1524         psrldq      xmm1,               8
   1525 
   1526         movq        QWORD PTR [rdi+4*rax], xmm1
   1527 
   1528         movq        QWORD PTR [rsi+2*rcx], xmm4
   1529         psrldq      xmm4,               8
   1530 
   1531         movq        QWORD PTR [rdi+2*rcx], xmm4
   1532 %endmacro
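
; unlike BV_WRITEBACK, the mb writeback stores whole 8-byte rows: the mb
; filter modifies p2..q2, so each transposed row is written back in full
; (the p3/q3 bytes are rewritten with their original values from srct).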
   1533 
   1534 
   1535 ;void vp8_mbloop_filter_vertical_edge_sse2
   1536 ;(
   1537 ;    unsigned char *src_ptr,
   1538 ;    int            src_pixel_step,
   1539 ;    const char    *flimit,
   1540 ;    const char    *limit,
   1541 ;    const char    *thresh,
   1542 ;    int            count
   1543 ;)
   1544 global sym(vp8_mbloop_filter_vertical_edge_sse2)
   1545 sym(vp8_mbloop_filter_vertical_edge_sse2):
   1546     push        rbp
   1547     mov         rbp, rsp
   1548     SHADOW_ARGS_TO_STACK 6
   1549     SAVE_XMM
   1550     GET_GOT     rbx
   1551     push        rsi
   1552     push        rdi
   1553     ; end prolog
   1554 
   1555     ALIGN_STACK 16, rax
   1556     sub          rsp, 160     ; reserve 160 bytes
   1557     %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[16];
   1558     %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[16];
   1559     %define srct [rsp + 32]   ;__declspec(align(16)) char srct[128];
   1560 
   1561         mov         rsi,                arg(0)              ; src_ptr
   1562         movsxd      rax,                dword ptr arg(1)    ; src_pixel_step
   1563 
   1564         lea         rsi,                [rsi - 4]
   1565         lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
   1566         lea         rcx,                [rax*2+rax]
   1567 
   1568         ; Transpose
   1569         TRANSPOSE_16X8_1
   1570 
   1571         lea         rsi,        [rsi+rax*8]
   1572         lea         rdi,        [rdi+rax*8]
   1573         lea         rdx,        srct
   1574         TRANSPOSE_16X8_2 0
   1575 
   1576         ; calculate filter mask
   1577         LFV_FILTER_MASK 0
   1578         ; calculate high edge variance
   1579         LFV_HEV_MASK
   1580 
   1581         neg         rax
   1582         ; start work on filters
   1583         MBV_FILTER
   1584 
   1585         lea         rsi,                [rsi+rax*8]
   1586         lea         rdi,                [rdi+rax*8]
   1587 
   1588         ; transpose and write back
   1589         MBV_TRANSPOSE
   1590 
   1591         neg         rax
   1592 
   1593         MBV_WRITEBACK_1
   1594 
   1595         lea         rsi,                [rsi+rax*8]
   1596         lea         rdi,                [rdi+rax*8]
   1597         MBV_WRITEBACK_2
   1598 
   1599     add rsp, 160
   1600     pop rsp
   1601     ; begin epilog
   1602     pop rdi
   1603     pop rsi
   1604     RESTORE_GOT
   1605     RESTORE_XMM
   1606     UNSHADOW_ARGS
   1607     pop         rbp
   1608     ret
   1609 
   1610 
   1611 ;void vp8_mbloop_filter_vertical_edge_uv_sse2
   1612 ;(
   1613 ;    unsigned char *u,
   1614 ;    int            src_pixel_step,
   1615 ;    const char    *flimit,
   1616 ;    const char    *limit,
   1617 ;    const char    *thresh,
   1618 ;    unsigned char *v
   1619 ;)
   1620 global sym(vp8_mbloop_filter_vertical_edge_uv_sse2)
   1621 sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
   1622     push        rbp
   1623     mov         rbp, rsp
   1624     SHADOW_ARGS_TO_STACK 6
   1625     SAVE_XMM
   1626     GET_GOT     rbx
   1627     push        rsi
   1628     push        rdi
   1629     ; end prolog
   1630 
   1631     ALIGN_STACK 16, rax
   1632     sub          rsp, 160     ; reserve 160 bytes
   1633     %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[16];
   1634     %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[16];
   1635     %define srct [rsp + 32]   ;__declspec(align(16)) char srct[128];
   1636 
   1637         mov         rsi,                arg(0)              ; u_ptr
   1638         movsxd      rax,                dword ptr arg(1)    ; src_pixel_step
   1639 
   1640         lea         rsi,                [rsi - 4]
   1641         lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
   1642         lea         rcx,                [rax+2*rax]
   1643 
   1644         ; Transpose
   1645         TRANSPOSE_16X8_1
   1646 
   1647         ; XMM3 XMM4 XMM7 in use
   1648         mov         rsi,                arg(5)              ; v_ptr
   1649         lea         rsi,                [rsi - 4]
   1650         lea         rdi,                [rsi + rax]
   1651         lea         rdx,                srct
   1652         TRANSPOSE_16X8_2 0
   1653 
   1654         ; calculate filter mask
   1655         LFV_FILTER_MASK 0
   1656         ; calculate high edge variance
   1657         LFV_HEV_MASK
   1658 
   1659         ; start work on filters
   1660         MBV_FILTER
   1661 
   1662         ; transpose and write back
   1663         MBV_TRANSPOSE
   1664 
   1665         mov         rsi,                arg(0)             ;u_ptr
   1666         lea         rsi,                [rsi - 4]
   1667         lea         rdi,                [rsi + rax]
   1668         MBV_WRITEBACK_1
   1669         mov         rsi,                arg(5)             ;v_ptr
   1670         lea         rsi,                [rsi - 4]
   1671         lea         rdi,                [rsi + rax]
   1672         MBV_WRITEBACK_2
   1673 
   1674     add rsp, 160
   1675     pop rsp
   1676     ; begin epilog
   1677     pop rdi
   1678     pop rsi
   1679     RESTORE_GOT
   1680     RESTORE_XMM
   1681     UNSHADOW_ARGS
   1682     pop         rbp
   1683     ret
   1684 
   1685 
   1686 ;void vp8_loop_filter_simple_horizontal_edge_sse2
   1687 ;(
   1688 ;    unsigned char *src_ptr,
   1689 ;    int  src_pixel_step,
   1690 ;    const char *flimit,
   1691 ;    const char *limit,
   1692 ;    const char *thresh,
   1693 ;    int count
   1694 ;)
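;
; scalar outline of the simple filter (a reference sketch in the signed
; domain; names illustrative):
;     mask    = (abs(p0-q0)*2 + abs(p1-q1)/2 <= flimit*2 + limit) ? ~0 : 0;
;     filter  = vp8_signed_char_clamp(3*(q0-p0) + (p1-q1)) & mask;
;     Filter1 = vp8_signed_char_clamp(filter + 4) >> 3;   /* q0 side */
;     Filter2 = vp8_signed_char_clamp(filter + 3) >> 3;   /* p0 side */
;     q0 = vp8_signed_char_clamp(q0 - Filter1);
;     p0 = vp8_signed_char_clamp(p0 + Filter2);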
   1695 global sym(vp8_loop_filter_simple_horizontal_edge_sse2)
   1696 sym(vp8_loop_filter_simple_horizontal_edge_sse2):
   1697     push        rbp
   1698     mov         rbp, rsp
   1699     SHADOW_ARGS_TO_STACK 6
   1700     SAVE_XMM
   1701     GET_GOT     rbx
   1702     push        rsi
   1703     push        rdi
   1704     ; end prolog
   1705 
   1706         mov         rsi, arg(0)             ;src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step
   1708         mov         rdx, arg(2) ;flimit     ; get flimit
   1709         movdqa      xmm3, XMMWORD PTR [rdx]
   1710         mov         rdx, arg(3) ;limit
   1711         movdqa      xmm7, XMMWORD PTR [rdx]
   1712 
   1713         paddb       xmm3, xmm3              ; flimit*2 (less than 255)
   1714         paddb       xmm3, xmm7              ; flimit * 2 + limit (less than 255)
   1715 
   1716         mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
   1717         add         rdi, rax
   1718         neg         rax
   1719 
   1720         ; calculate mask
   1721         movdqu      xmm1, [rsi+2*rax]       ; p1
   1722         movdqu      xmm0, [rdi]             ; q1
   1723         movdqa      xmm2, xmm1
   1724         movdqa      xmm7, xmm0
   1725         movdqa      xmm4, xmm0
   1726         psubusb     xmm0, xmm1              ; q1-=p1
   1727         psubusb     xmm1, xmm4              ; p1-=q1
   1728         por         xmm1, xmm0              ; abs(p1-q1)
   1729         pand        xmm1, [tfe GLOBAL]      ; set lsb of each byte to zero
   1730         psrlw       xmm1, 1                 ; abs(p1-q1)/2
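        ; (tfe clears each byte's lsb so the word-wide psrlw cannot carry a
        ;  bit across byte boundaries; the pair acts as a per-byte >> 1)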
   1731 
   1732         movdqu      xmm5, [rsi+rax]         ; p0
   1733         movdqu      xmm4, [rsi]             ; q0
   1734         movdqa      xmm0, xmm4              ; q0
   1735         movdqa      xmm6, xmm5              ; p0
   1736         psubusb     xmm5, xmm4              ; p0-=q0
   1737         psubusb     xmm4, xmm6              ; q0-=p0
   1738         por         xmm5, xmm4              ; abs(p0 - q0)
   1739         paddusb     xmm5, xmm5              ; abs(p0-q0)*2
   1740         paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
   1741 
   1742         psubusb     xmm5, xmm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
   1743         pxor        xmm3, xmm3
   1744         pcmpeqb     xmm5, xmm3
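        ; (psubusb saturates to zero wherever the sum is <= the threshold,
        ;  so comparing with zero yields a 0xff/0x00 per-byte mask)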
   1745 
   1746         ; start work on filters
   1747         pxor        xmm2, [t80 GLOBAL]      ; p1 offset to convert to signed values
   1748         pxor        xmm7, [t80 GLOBAL]      ; q1 offset to convert to signed values
   1749         psubsb      xmm2, xmm7              ; p1 - q1
   1750 
   1751         pxor        xmm6, [t80 GLOBAL]      ; offset to convert to signed values
   1752         pxor        xmm0, [t80 GLOBAL]      ; offset to convert to signed values
   1753         movdqa      xmm3, xmm0              ; q0
   1754         psubsb      xmm0, xmm6              ; q0 - p0
   1755         paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
   1756         paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
   1757         paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
   1758         pand        xmm5, xmm2              ; mask filter values we don't care about
   1759 
   1760         ; do + 4 side
   1761         paddsb      xmm5, [t4 GLOBAL]       ; 3* (q0 - p0) + (p1 - q1) + 4
   1762 
   1763         movdqa      xmm0, xmm5              ; get a copy of filters
   1764         psllw       xmm0, 8                 ; shift left 8
        psraw       xmm0, 3                 ; arithmetic shift right 3
        psrlw       xmm0, 8                 ; logical shift right 8 to put it back
   1767         movdqa      xmm1, xmm5              ; get a copy of filters
   1768         psraw       xmm1, 11                ; arithmetic shift right 11
   1769         psllw       xmm1, 8                 ; shift left 8 to put it back
   1770 
   1771         por         xmm0, xmm1              ; put the two together to get result
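        ; (the word shifts emulate a per-byte arithmetic >> 3: the psllw-8 /
        ;  psraw-3 / psrlw-8 chain handles the low byte of each word, the
        ;  psraw-11 / psllw-8 pair the high byte, merged by the por)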
   1772 
   1773         psubsb      xmm3, xmm0              ; q0-= q0 add
   1774         pxor        xmm3, [t80 GLOBAL]      ; unoffset
   1775         movdqu      [rsi], xmm3             ; write back
   1776 
   1777         ; now do +3 side
   1778         psubsb      xmm5, [t1s GLOBAL]      ; +3 instead of +4
   1779 
   1780         movdqa      xmm0, xmm5              ; get a copy of filters
   1781         psllw       xmm0, 8                 ; shift left 8
        psraw       xmm0, 3                 ; arithmetic shift right 3
        psrlw       xmm0, 8                 ; logical shift right 8 to put it back
   1784         psraw       xmm5, 11                ; arithmetic shift right 11
   1785         psllw       xmm5, 8                 ; shift left 8 to put it back
   1786         por         xmm0, xmm5              ; put the two together to get result
   1787 
   1788 
   1789         paddsb      xmm6, xmm0              ; p0+= p0 add
   1790         pxor        xmm6, [t80 GLOBAL]      ; unoffset
   1791         movdqu      [rsi+rax], xmm6         ; write back
   1792 
   1793     ; begin epilog
   1794     pop rdi
   1795     pop rsi
   1796     RESTORE_GOT
   1797     RESTORE_XMM
   1798     UNSHADOW_ARGS
   1799     pop         rbp
   1800     ret
   1801 
   1802 
   1803 ;void vp8_loop_filter_simple_vertical_edge_sse2
   1804 ;(
   1805 ;    unsigned char *src_ptr,
   1806 ;    int  src_pixel_step,
   1807 ;    const char *flimit,
   1808 ;    const char *limit,
   1809 ;    const char *thresh,
   1810 ;    int count
   1811 ;)
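;
; outline (for reference): sixteen 4-byte rows are read with movdqu and
; transposed (staged through t0/t1) into the four working lines p1 p0 q0
; q1, filtered exactly as in the horizontal version above, and the four
; results are transposed back and stored column by column with movd.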
   1812 global sym(vp8_loop_filter_simple_vertical_edge_sse2)
   1813 sym(vp8_loop_filter_simple_vertical_edge_sse2):
   1814     push        rbp         ; save old base pointer value.
   1815     mov         rbp, rsp    ; set new base pointer value.
   1816     SHADOW_ARGS_TO_STACK 6
   1817     SAVE_XMM
   1818     GET_GOT     rbx         ; save callee-saved reg
   1819     push        rsi
   1820     push        rdi
   1821     ; end prolog
   1822 
   1823     ALIGN_STACK 16, rax
   1824     sub         rsp, 32                         ; reserve 32 bytes
   1825     %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[16];
   1826     %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[16];
   1827 
   1828         mov         rsi, arg(0) ;src_ptr
        movsxd      rax, dword ptr arg(1) ; src_pixel_step
   1830 
        lea         rsi,        [rsi - 2]
   1832         lea         rdi,        [rsi + rax]
   1833         lea         rdx,        [rsi + rax*4]
   1834         lea         rcx,        [rdx + rax]
   1835 
   1836         movdqu      xmm0,       [rsi]                   ; (high 96 bits unused) 03 02 01 00
   1837         movdqu      xmm1,       [rdx]                   ; (high 96 bits unused) 43 42 41 40
   1838         movdqu      xmm2,       [rdi]                   ; 13 12 11 10
   1839         movdqu      xmm3,       [rcx]                   ; 53 52 51 50
   1840         punpckldq   xmm0,       xmm1                    ; (high 64 bits unused) 43 42 41 40 03 02 01 00
   1841         punpckldq   xmm2,       xmm3                    ; 53 52 51 50 13 12 11 10
   1842 
   1843         movdqu      xmm4,       [rsi + rax*2]           ; 23 22 21 20
   1844         movdqu      xmm5,       [rdx + rax*2]           ; 63 62 61 60
   1845         movdqu      xmm6,       [rdi + rax*2]           ; 33 32 31 30
   1846         movdqu      xmm7,       [rcx + rax*2]           ; 73 72 71 70
   1847         punpckldq   xmm4,       xmm5                    ; 63 62 61 60 23 22 21 20
   1848         punpckldq   xmm6,       xmm7                    ; 73 72 71 70 33 32 31 30
   1849 
   1850         punpcklbw   xmm0,       xmm2                    ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
   1851         punpcklbw   xmm4,       xmm6                    ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
   1852 
   1853         movdqa      xmm1,       xmm0
   1854         punpcklwd   xmm0,       xmm4                    ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
   1855         punpckhwd   xmm1,       xmm4                    ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
   1856 
   1857         movdqa      xmm2,       xmm0
   1858         punpckldq   xmm0,       xmm1                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
   1859         punpckhdq   xmm2,       xmm1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
   1860 
   1861         movdqa      t0,         xmm0                    ; save to t0
   1862         movdqa      t1,         xmm2                    ; save to t1
   1863 
   1864         lea         rsi,        [rsi + rax*8]
   1865         lea         rdi,        [rsi + rax]
   1866         lea         rdx,        [rsi + rax*4]
   1867         lea         rcx,        [rdx + rax]
   1868 
   1869         movdqu      xmm4,       [rsi]                   ; 83 82 81 80
   1870         movdqu      xmm1,       [rdx]                   ; c3 c2 c1 c0
   1871         movdqu      xmm6,       [rdi]                   ; 93 92 91 90
   1872         movdqu      xmm3,       [rcx]                   ; d3 d2 d1 d0
   1873         punpckldq   xmm4,       xmm1                    ; c3 c2 c1 c0 83 82 81 80
   1874         punpckldq   xmm6,       xmm3                    ; d3 d2 d1 d0 93 92 91 90
   1875 
   1876         movdqu      xmm0,       [rsi + rax*2]           ; a3 a2 a1 a0
   1877         movdqu      xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
   1878         movdqu      xmm2,       [rdi + rax*2]           ; b3 b2 b1 b0
   1879         movdqu      xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
   1880         punpckldq   xmm0,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
   1881         punpckldq   xmm2,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0
   1882 
   1883         punpcklbw   xmm4,       xmm6                    ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
   1884         punpcklbw   xmm0,       xmm2                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
   1885 
   1886         movdqa      xmm1,       xmm4
   1887         punpcklwd   xmm4,       xmm0                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
   1888         punpckhwd   xmm1,       xmm0                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
   1889 
   1890         movdqa      xmm6,       xmm4
   1891         punpckldq   xmm4,       xmm1                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
   1892         punpckhdq   xmm6,       xmm1                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
   1893 
   1894         movdqa      xmm0,       t0                      ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
   1895         movdqa      xmm2,       t1                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
   1896         movdqa      xmm1,       xmm0
   1897         movdqa      xmm3,       xmm2
   1898 
   1899         punpcklqdq  xmm0,       xmm4                    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
   1900         punpckhqdq  xmm1,       xmm4                    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
   1901         punpcklqdq  xmm2,       xmm6                    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
   1902         punpckhqdq  xmm3,       xmm6                    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
   1903 
   1904         ; calculate mask
   1905         movdqa      xmm6,       xmm0                            ; p1
   1906         movdqa      xmm7,       xmm3                            ; q1
   1907         psubusb     xmm7,       xmm0                            ; q1-=p1
   1908         psubusb     xmm6,       xmm3                            ; p1-=q1
   1909         por         xmm6,       xmm7                            ; abs(p1-q1)
   1910         pand        xmm6,       [tfe GLOBAL]                    ; set lsb of each byte to zero
   1911         psrlw       xmm6,       1                               ; abs(p1-q1)/2
   1912 
   1913         movdqa      xmm5,       xmm1                            ; p0
   1914         movdqa      xmm4,       xmm2                            ; q0
   1915         psubusb     xmm5,       xmm2                            ; p0-=q0
   1916         psubusb     xmm4,       xmm1                            ; q0-=p0
   1917         por         xmm5,       xmm4                            ; abs(p0 - q0)
   1918         paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
   1919         paddusb     xmm5,       xmm6                            ; abs (p0 - q0) *2 + abs(p1-q1)/2
   1920 
   1921         mov         rdx,        arg(2)                          ;flimit
   1922         movdqa      xmm7, XMMWORD PTR [rdx]
   1923         mov         rdx,        arg(3)                          ; get limit
   1924         movdqa      xmm6, XMMWORD PTR [rdx]
   1925         paddb       xmm7,        xmm7                           ; flimit*2 (less than 255)
   1926         paddb       xmm7,        xmm6                           ; flimit * 2 + limit (less than 255)
   1927 
   1928         psubusb     xmm5,        xmm7                           ; abs(p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
   1929         pxor        xmm7,        xmm7
        pcmpeqb     xmm5,        xmm7                           ; xmm5 = mask
   1931 
   1932         ; start work on filters
   1933         movdqa        t0,        xmm0
   1934         movdqa        t1,        xmm3
   1935 
   1936         pxor        xmm0,        [t80 GLOBAL]                   ; p1 offset to convert to signed values
   1937         pxor        xmm3,        [t80 GLOBAL]                   ; q1 offset to convert to signed values
   1938 
   1939         psubsb      xmm0,        xmm3                           ; p1 - q1
   1940         movdqa      xmm6,        xmm1                           ; p0
   1941 
   1942         movdqa      xmm7,        xmm2                           ; q0
   1943         pxor        xmm6,        [t80 GLOBAL]                   ; offset to convert to signed values
   1944 
        movdqa      xmm3,        xmm7                           ; signed (offset) copy of q0
   1946         movdqa      xmm3,        xmm7                           ; offseted ; q0
   1947 
   1948         psubsb      xmm7,        xmm6                           ; q0 - p0
   1949         paddsb      xmm0,        xmm7                           ; p1 - q1 + 1 * (q0 - p0)
   1950 
   1951         paddsb      xmm0,        xmm7                           ; p1 - q1 + 2 * (q0 - p0)
   1952         paddsb      xmm0,        xmm7                           ; p1 - q1 + 3 * (q0 - p0)
   1953 
   1954         pand        xmm5,        xmm0                           ; mask filter values we don't care about
   1955 
   1956 
   1957         paddsb      xmm5,        [t4 GLOBAL]                    ;  3* (q0 - p0) + (p1 - q1) + 4
   1958 
   1959         movdqa      xmm0,        xmm5                           ; get a copy of filters
   1960         psllw       xmm0,        8                              ; shift left 8
   1961 
        psraw       xmm0,        3                              ; arithmetic shift right 3
        psrlw       xmm0,        8                              ; logical shift right 8 to put it back
   1964 
   1965         movdqa      xmm7,        xmm5                           ; get a copy of filters
   1966         psraw       xmm7,        11                             ; arithmetic shift right 11
   1967 
   1968         psllw       xmm7,        8                              ; shift left 8 to put it back
   1969         por         xmm0,        xmm7                           ; put the two together to get result
        psubsb      xmm3,        xmm0                           ; q0 -= q0 add
   1971         psubsb      xmm3,        xmm0                           ; q0-= q0sz add
   1972         pxor        xmm3,        [t80 GLOBAL]                   ; unoffset   q0
   1973 
   1974         ; now do +3 side
   1975         psubsb      xmm5,        [t1s GLOBAL]                   ; +3 instead of +4
   1976         movdqa      xmm0,        xmm5                           ; get a copy of filters
   1977 
   1978         psllw       xmm0,        8                              ; shift left 8
        psraw       xmm0,        3                              ; arithmetic shift right 3
   1980 
        psrlw       xmm0,        8                              ; logical shift right 8 to put it back
   1982         psraw       xmm5,        11                             ; arithmetic shift right 11
   1983 
   1984         psllw       xmm5,        8                              ; shift left 8 to put it back
   1985         por         xmm0,        xmm5                           ; put the two together to get result
   1986 
   1987         paddsb      xmm6,        xmm0                           ; p0+= p0 add
   1988         pxor        xmm6,        [t80 GLOBAL]                   ; unoffset   p0
   1989 
   1990         movdqa      xmm0,        t0                             ; p1
   1991         movdqa      xmm4,        t1                             ; q1
   1992 
   1993         ; transpose back to write out
   1994         ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
   1995         ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
   1996         ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
   1997         ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
   1998         movdqa      xmm1,       xmm0
   1999         punpcklbw   xmm0,       xmm6                               ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
   2000         punpckhbw   xmm1,       xmm6                               ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
   2001 
   2002         movdqa      xmm5,       xmm3
   2003         punpcklbw   xmm3,       xmm4                               ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
   2004         punpckhbw   xmm5,       xmm4                               ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
   2005 
   2006         movdqa      xmm2,       xmm0
   2007         punpcklwd   xmm0,       xmm3                               ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
   2008         punpckhwd   xmm2,       xmm3                               ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
   2009 
   2010         movdqa      xmm3,       xmm1
   2011         punpcklwd   xmm1,       xmm5                               ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
   2012         punpckhwd   xmm3,       xmm5                               ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
   2013 
   2014         ; write out order: xmm0 xmm2 xmm1 xmm3
   2015         lea         rdx,        [rsi + rax*4]
   2016 
   2017         movd        [rsi],      xmm1                               ; write the second 8-line result
   2018         psrldq      xmm1,       4
   2019         movd        [rdi],      xmm1
   2020         psrldq      xmm1,       4
   2021         movd        [rsi + rax*2], xmm1
   2022         psrldq      xmm1,       4
   2023         movd        [rdi + rax*2], xmm1
   2024 
   2025         movd        [rdx],      xmm3
   2026         psrldq      xmm3,       4
   2027         movd        [rcx],      xmm3
   2028         psrldq      xmm3,       4
   2029         movd        [rdx + rax*2], xmm3
   2030         psrldq      xmm3,       4
   2031         movd        [rcx + rax*2], xmm3
   2032 
   2033         neg         rax
   2034         lea         rsi,        [rsi + rax*8]
   2035         neg         rax
   2036         lea         rdi,        [rsi + rax]
   2037         lea         rdx,        [rsi + rax*4]
   2038         lea         rcx,        [rdx + rax]
   2039 
   2040         movd        [rsi],      xmm0                                ; write the first 8-line result
   2041         psrldq      xmm0,       4
   2042         movd        [rdi],      xmm0
   2043         psrldq      xmm0,       4
   2044         movd        [rsi + rax*2], xmm0
   2045         psrldq      xmm0,       4
   2046         movd        [rdi + rax*2], xmm0
   2047 
   2048         movd        [rdx],      xmm2
   2049         psrldq      xmm2,       4
   2050         movd        [rcx],      xmm2
   2051         psrldq      xmm2,       4
   2052         movd        [rdx + rax*2], xmm2
   2053         psrldq      xmm2,       4
   2054         movd        [rcx + rax*2], xmm2
   2055 
   2056     add rsp, 32
   2057     pop rsp
   2058     ; begin epilog
   2059     pop rdi
   2060     pop rsi
   2061     RESTORE_GOT
   2062     RESTORE_XMM
   2063     UNSHADOW_ARGS
   2064     pop         rbp
   2065     ret
   2066 
   2067 SECTION_RODATA
   2068 align 16
   2069 tfe:
   2070     times 16 db 0xfe
   2071 align 16
   2072 t80:
   2073     times 16 db 0x80
   2074 align 16
   2075 t1s:
   2076     times 16 db 0x01
   2077 align 16
   2078 t3:
   2079     times 16 db 0x03
   2080 align 16
   2081 t4:
   2082     times 16 db 0x04
   2083 align 16
   2084 ones:
   2085     times 8 dw 0x0001
   2086 align 16
   2087 s27:
   2088     times 8 dw 0x1b00
   2089 align 16
   2090 s18:
   2091     times 8 dw 0x1200
   2092 align 16
   2093 s9:
   2094     times 8 dw 0x0900
   2095 align 16
   2096 s63:
   2097     times 8 dw 0x003f
   2098