; Home | History | Annotate | Download | only in x86  (code-viewer navigation residue, kept as a comment)
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 
     15 ;void vp8_loop_filter_horizontal_edge_mmx
     16 ;(
     17 ;    unsigned char *src_ptr,
     18 ;    int src_pixel_step,
     19 ;    const char *flimit,
     20 ;    const char *limit,
     21 ;    const char *thresh,
     22 ;    int  count
     23 ;)
;-----------------------------------------------------------------------
; void vp8_loop_filter_horizontal_edge_mmx(unsigned char *src_ptr,
;                                          int src_pixel_step,
;                                          const char *flimit,
;                                          const char *limit,
;                                          const char *thresh,
;                                          int count)
;
; VP8 normal loop filter applied across a horizontal edge: src_ptr points
; at the first row below the edge (q0); rows above are reached with a
; negated pitch.  Processes 8 pixel columns per iteration, 'count' times.
;
; Register roles inside the loop:
;   rsi = current source pointer (q0 row), rdi = rsi + pitch (q1 row)
;   rax = pitch (negated while addressing the p-side rows)
;   rcx = remaining iteration count
;   t0/t1 = 8-byte stack scratch holding abs(q0-q1) and abs(p1-p0)
;-----------------------------------------------------------------------
global sym(vp8_loop_filter_horizontal_edge_mmx)
sym(vp8_loop_filter_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                         ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];

        mov         rsi, arg(0) ;src_ptr
        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?

        movsxd      rcx, dword ptr arg(5) ;count
next8_h:
        mov         rdx, arg(3) ;limit
        movq        mm7, [rdx]            ; mm7 = limit, broadcast per byte
        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
        add         rdi, rax

        ; calculate breakout conditions:
        ; mm1 accumulates nonzero bytes wherever any |delta| exceeds limit
        movq        mm2, [rdi+2*rax]      ; q3
        movq        mm1, [rsi+2*rax]      ; q2
        movq        mm6, mm1              ; q2
        psubusb     mm1, mm2              ; q2-=q3
        psubusb     mm2, mm6              ; q3-=q2
        por         mm1, mm2              ; abs(q3-q2)
        psubusb     mm1, mm7              ; nonzero where abs(q3-q2) > limit


        movq        mm4, [rsi+rax]        ; q1
        movq        mm3, mm4              ; q1
        psubusb     mm4, mm6              ; q1-=q2
        psubusb     mm6, mm3              ; q2-=q1
        por         mm4, mm6              ; abs(q2-q1)

        psubusb     mm4, mm7
        por        mm1, mm4               ; fold abs(q2-q1) > limit into mask

        movq        mm4, [rsi]            ; q0
        movq        mm0, mm4              ; q0
        psubusb     mm4, mm3              ; q0-=q1
        psubusb     mm3, mm0              ; q1-=q0
        por         mm4, mm3              ; abs(q0-q1)
        movq        t0, mm4               ; save abs(q0-q1) to t0 for the hev test
        psubusb     mm4, mm7
        por        mm1, mm4


        neg         rax                   ; negate pitch to deal with above border

        movq        mm2, [rsi+4*rax]      ; p3
        movq        mm4, [rdi+4*rax]      ; p2
        movq        mm5, mm4              ; p2
        psubusb     mm4, mm2              ; p2-=p3
        psubusb     mm2, mm5              ; p3-=p2
        por         mm4, mm2              ; abs(p3 - p2)
        psubusb     mm4, mm7
        por        mm1, mm4


        movq        mm4, [rsi+2*rax]      ; p1
        movq        mm3, mm4              ; p1
        psubusb     mm4, mm5              ; p1-=p2
        psubusb     mm5, mm3              ; p2-=p1
        por         mm4, mm5              ; abs(p2 - p1)
        psubusb     mm4, mm7
        por        mm1, mm4

        movq        mm2, mm3              ; p1

        movq        mm4, [rsi+rax]        ; p0
        movq        mm5, mm4              ; p0
        psubusb     mm4, mm3              ; p0-=p1
        psubusb     mm3, mm5              ; p1-=p0
        por         mm4, mm3              ; abs(p1 - p0)
        movq        t1, mm4               ; save abs(p1-p0) to t1 for the hev test
        psubusb     mm4, mm7
        por        mm1, mm4

        movq        mm3, [rdi]            ; q1
        movq        mm4, mm3              ; q1
        psubusb     mm3, mm2              ; q1-=p1
        psubusb     mm2, mm4              ; p1-=q1
        por         mm2, mm3              ; abs(p1-q1)
        pand        mm2, [tfe GLOBAL]     ; set lsb of each byte to zero
        psrlw       mm2, 1                ; abs(p1-q1)/2

        movq        mm6, mm5              ; p0
        movq        mm3, [rsi]            ; q0
        psubusb     mm5, mm3              ; p0-=q0
        psubusb     mm3, mm6              ; q0-=p0
        por         mm5, mm3              ; abs(p0 - q0)
        paddusb     mm5, mm5              ; abs(p0-q0)*2
        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        mov         rdx, arg(2) ;flimit           ; get flimit
        movq        mm2, [rdx]            ; flimit mm2
        paddb       mm2, mm2              ; flimit*2 (less than 255)
        paddb       mm7, mm2              ; flimit * 2 + limit (less than 255)

        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
        por         mm1,    mm5
        pxor        mm5,    mm5
        pcmpeqb     mm1,    mm5           ; mask mm1: 0xFF per byte where the filter applies

        ; calculate high edge variance (hev): nonzero where either
        ; abs(q1-q0) or abs(p1-p0) exceeds thresh
        mov         rdx, arg(4) ;thresh           ; get thresh
        movq        mm7, [rdx]            ; mm7 = thresh
        movq        mm4, t0               ; get abs (q1 - q0)
        psubusb     mm4, mm7
        movq        mm3, t1               ; get abs (p1 - p0)
        psubusb     mm3, mm7
        ; NOTE(review): paddb can wrap to zero when both bytes are nonzero
        ; (e.g. 0x80+0x80); the vertical version combines with por here —
        ; confirm the two paths are meant to be equivalent.
        paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        pcmpeqb     mm4,        mm5       ; 0xFF where NOT high-variance

        pcmpeqb     mm5,        mm5       ; all ones
        pxor        mm4,        mm5       ; invert: mm4 = hev mask


        ; start work on filters
        movq        mm2, [rsi+2*rax]      ; p1
        movq        mm7, [rdi]            ; q1
        pxor        mm2, [t80 GLOBAL]     ; p1 offset to convert to signed values
        pxor        mm7, [t80 GLOBAL]     ; q1 offset to convert to signed values
        psubsb      mm2, mm7              ; p1 - q1
        pand        mm2, mm4              ; high var mask (hvm)(p1 - q1)
        pxor        mm6, [t80 GLOBAL]     ; offset to convert to signed values
        pxor        mm0, [t80 GLOBAL]     ; offset to convert to signed values
        movq        mm3, mm0              ; q0
        psubsb      mm0, mm6              ; q0 - p0
        paddsb      mm2, mm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
        paddsb      mm2, mm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
        paddsb      mm2, mm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
        pand        mm1, mm2                  ; mask filter values we don't care about
        movq        mm2, mm1
        paddsb      mm1, [t4 GLOBAL]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      mm2, [t3 GLOBAL]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        ; signed per-byte >>3: unpack bytes into the HIGH byte of each
        ; word, arithmetic-shift by 11 (8 to drop the low byte + 3), repack
        pxor        mm0, mm0             ;
        pxor        mm5, mm5
        punpcklbw   mm0, mm2            ;
        punpckhbw   mm5, mm2            ;
        psraw       mm0, 11             ;
        psraw       mm5, 11
        packsswb    mm0, mm5
        movq        mm2, mm0            ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;

        pxor        mm0, mm0              ; 0
        movq        mm5, mm1              ; abcdefgh
        punpcklbw   mm0, mm1              ; e0f0g0h0
        psraw       mm0, 11               ; sign extended shift right by 3
        pxor        mm1, mm1              ; 0
        punpckhbw   mm1, mm5              ; a0b0c0d0
        psraw       mm1, 11               ; sign extended shift right by 3
        movq        mm5, mm0              ; save results

        packsswb    mm0, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      mm5, [ones GLOBAL]    ; round before the extra >>1
        paddsw      mm1, [ones GLOBAL]
        psraw       mm5, 1                ; partial shifted one more time for 2nd tap
        psraw       mm1, 1                ; partial shifted one more time for 2nd tap
        packsswb    mm5, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
        pandn       mm4, mm5              ; high edge variance additive

        paddsb      mm6, mm2              ; p0+= p0 add
        pxor        mm6, [t80 GLOBAL]     ; unoffset
        movq        [rsi+rax], mm6        ; write back

        movq        mm6, [rsi+2*rax]      ; p1
        pxor        mm6, [t80 GLOBAL]     ; reoffset
        paddsb      mm6, mm4              ; p1+= p1 add
        pxor        mm6, [t80 GLOBAL]     ; unoffset
        movq        [rsi+2*rax], mm6      ; write back

        psubsb      mm3, mm0              ; q0-= q0 add
        pxor        mm3, [t80 GLOBAL]     ; unoffset
        movq        [rsi], mm3            ; write back

        psubsb      mm7, mm4              ; q1-= q1 add
        pxor        mm7, [t80 GLOBAL]     ; unoffset
        movq        [rdi], mm7            ; write back

        add         rsi,8                 ; advance 8 pixel columns
        neg         rax                   ; restore positive pitch for next iteration
        dec         rcx
        jnz         next8_h

    add rsp, 32
    pop rsp                               ; undo ALIGN_STACK (restores saved rsp)
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
    227 
    228 
    229 ;void vp8_loop_filter_vertical_edge_mmx
    230 ;(
    231 ;    unsigned char *src_ptr,
    232 ;    int  src_pixel_step,
    233 ;    const char *flimit,
    234 ;    const char *limit,
    235 ;    const char *thresh,
    236 ;    int count
    237 ;)
;-----------------------------------------------------------------------
; void vp8_loop_filter_vertical_edge_mmx(unsigned char *src_ptr,
;                                        int src_pixel_step,
;                                        const char *flimit,
;                                        const char *limit,
;                                        const char *thresh,
;                                        int count)
;
; VP8 normal loop filter applied across a vertical edge.  Each iteration
; loads an 8x8 block straddling the edge, transposes it with MMX unpacks
; so p3..q3 become registers, runs the same filter math as the horizontal
; version, then transposes the four modified columns (p1,p0,q0,q1) back
; and stores them.  'count' iterations of 8 rows each.
;
; Stack scratch: t0 = abs(p1-p0), t1 = abs(q1-q0),
; srct[32] holds transposed p1,p0,q0,q1 at offsets 0,8,16,24.
;-----------------------------------------------------------------------
global sym(vp8_loop_filter_vertical_edge_mmx)
sym(vp8_loop_filter_vertical_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub          rsp, 64      ; reserve 64 bytes
    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[32];

        mov         rsi,        arg(0) ;src_ptr
        movsxd      rax,        dword ptr arg(1) ;src_pixel_step     ; destination pitch?

        lea         rsi,        [rsi + rax*4 - 4]   ; center of 8x8: 4 rows down, 4 bytes left of edge

        movsxd      rcx,        dword ptr arg(5) ;count
next8_v:
        mov         rdi,        rsi           ; rdi points to row +1 for indirect addressing
        add         rdi,        rax


        ;transpose (comments show byte layout as RowCol, e.g. 67 = row 6 col 7)
        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
        movq        mm7,        mm6                         ; 77 76 75 74 73 72 71 70

        punpckhbw   mm7,        [rdi+2*rax]                 ; 77 67 76 66 75 65 74 64
        punpcklbw   mm6,        [rdi+2*rax]                 ; 73 63 72 62 71 61 70 60

        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
        movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40

        punpckhbw   mm5,        [rsi+rax]                   ; 57 47 56 46 55 45 54 44
        punpcklbw   mm4,        [rsi+rax]                   ; 53 43 52 42 51 41 50 40

        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46

        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40

        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40

        neg         rax                                     ; address rows above rsi
        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20

        movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
        punpckhbw   mm6,        [rsi+rax]                   ; 37 27 36 26 35 25 34 24

        punpcklbw   mm1,        [rsi+rax]                   ; 33 23 32 22 31 21 30 20
        movq        mm7,        [rsi+rax*4];                ; 07 06 05 04 03 02 01 00

        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04

        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04

        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3

        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2

        movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06
        psubusb     mm5,        mm7                         ; q2-q3

        psubusb     mm7,        mm6                         ; q3-q2
        por         mm7,        mm5;                        ; mm7=abs (q3-q2)

        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1

        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 14 04 = q0
        movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1

        psubusb     mm3,        mm6                         ; q1-q2
        psubusb     mm6,        mm5                         ; q2-q1

        por         mm6,        mm3                         ; mm6=abs(q2-q1)
        lea         rdx,        srct

        movq        [rdx+24],   mm5                         ; save q1
        movq        [rdx+16],   mm0                         ; save q0

        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00

        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00

        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00

        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2

        movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
        psubusb     mm2,        mm0                         ; p2-p3

        psubusb     mm0,        mm1                         ; p3-p2
        por         mm0,        mm2                         ; mm0=abs(p3-p2)

        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1

        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
        movq        [rdx+8],    mm3                         ; save p0

        movq        [rdx],      mm2                         ; save p1
        movq        mm5,        mm2                         ; mm5 = p1

        psubusb     mm2,        mm1                         ; p1-p2
        psubusb     mm1,        mm5                         ; p2-p1

        por         mm1,        mm2                         ; mm1=abs(p2-p1)
        mov         rdx,        arg(3) ;limit

        ; build filter mask: nonzero bytes where any |delta| > limit
        movq        mm4,        [rdx]                       ; mm4 = limit
        psubusb     mm7,        mm4

        psubusb     mm0,        mm4
        psubusb     mm1,        mm4

        psubusb     mm6,        mm4
        por         mm7,        mm6

        por         mm0,        mm1
        por         mm0,        mm7                         ;   abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit

        movq        mm1,        mm5                         ; p1

        movq        mm7,        mm3                         ; mm3=mm7=p0
        psubusb     mm7,        mm5                         ; p0 - p1

        psubusb     mm5,        mm3                         ; p1 - p0
        por         mm5,        mm7                         ; abs(p1-p0)

        movq        t0,         mm5                         ; save abs(p1-p0)
        lea         rdx,        srct

        psubusb     mm5,        mm4
        por         mm0,        mm5                         ; mm0=mask

        movq        mm5,        [rdx+16]                    ; mm5=q0
        movq        mm7,        [rdx+24]                    ; mm7=q1

        movq        mm6,        mm5                         ; mm6=q0
        movq        mm2,        mm7                         ; q1
        psubusb     mm5,        mm7                         ; q0-q1

        psubusb     mm7,        mm6                         ; q1-q0
        por         mm7,        mm5                         ; abs(q1-q0)

        movq        t1,         mm7                         ; save abs(q1-q0)
        psubusb     mm7,        mm4

        por         mm0,        mm7                         ; mask

        movq        mm5,        mm2                         ; q1
        psubusb     mm5,        mm1                         ; q1-=p1
        psubusb     mm1,        mm2                         ; p1-=q1
        por         mm5,        mm1                         ; abs(p1-q1)
        pand        mm5,        [tfe GLOBAL]                ; set lsb of each byte to zero
        psrlw       mm5,        1                           ; abs(p1-q1)/2

        mov         rdx,        arg(2) ;flimit                      ;

        movq        mm2,        [rdx]                       ;flimit  mm2
        movq        mm1,        mm3                         ; mm1=mm3=p0

        movq        mm7,        mm6                         ; mm7=mm6=q0
        psubusb     mm1,        mm7                         ; p0-q0

        psubusb     mm7,        mm3                         ; q0-p0
        por         mm1,        mm7                         ; abs(q0-p0)
        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
        paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2

        paddb       mm2,        mm2                         ; flimit*2 (less than 255)
        paddb       mm4,        mm2                         ; flimit * 2 + limit (less than 255)

        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
        por         mm1,        mm0;                        ; mask

        pxor        mm0,        mm0
        pcmpeqb     mm1,        mm0                         ; mm1 = 0xFF per byte where filter applies

        ; calculate high edge variance
        mov         rdx,        arg(4) ;thresh            ; get thresh
        movq        mm7,        [rdx]
        ;
        movq        mm4,        t0              ; get abs (p1 - p0)
        psubusb     mm4,        mm7

        movq        mm3,        t1              ; get abs (q1 - q0)
        psubusb     mm3,        mm7

        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
        pcmpeqb     mm4,        mm0             ; 0xFF where NOT high-variance

        pcmpeqb     mm0,        mm0             ; all ones
        pxor        mm4,        mm0             ; invert: mm4 = hev mask



        ; start work on filters (same math as the horizontal edge version)
        lea         rdx,        srct

        movq        mm2,        [rdx]           ; p1
        movq        mm7,        [rdx+24]        ; q1

        movq        mm6,        [rdx+8]         ; p0
        movq        mm0,        [rdx+16]        ; q0

        pxor        mm2,        [t80 GLOBAL]    ; p1 offset to convert to signed values
        pxor        mm7,        [t80 GLOBAL]    ; q1 offset to convert to signed values

        psubsb      mm2,        mm7             ; p1 - q1
        pand        mm2,        mm4             ; high var mask (hvm)(p1 - q1)

        pxor        mm6,        [t80 GLOBAL]    ; offset to convert to signed values
        pxor        mm0,        [t80 GLOBAL]    ; offset to convert to signed values

        movq        mm3,        mm0             ; q0
        psubsb      mm0,        mm6             ; q0 - p0

        paddsb      mm2,        mm0             ; 1 * (q0 - p0) + hvm(p1 - q1)
        paddsb      mm2,        mm0             ; 2 * (q0 - p0) + hvm(p1 - q1)

        paddsb      mm2,        mm0             ; 3 * (q0 - p0) + hvm(p1 - q1)
        pand       mm1,        mm2              ; mask filter values we don't care about

        movq        mm2,        mm1
        paddsb      mm1,        [t4 GLOBAL]       ; 3* (q0 - p0) + hvm(p1 - q1) + 4

        paddsb      mm2,        [t3 GLOBAL]       ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        ; signed per-byte >>3: unpack into high byte of each word,
        ; arithmetic-shift by 11, repack
        pxor        mm0,        mm0          ;

        pxor        mm5,        mm5
        punpcklbw   mm0,        mm2         ;

        punpckhbw   mm5,        mm2         ;
        psraw       mm0,        11              ;

        psraw       mm5,        11
        packsswb    mm0,        mm5

        movq        mm2,        mm0         ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;

        pxor        mm0,        mm0           ; 0
        movq        mm5,        mm1           ; abcdefgh

        punpcklbw   mm0,        mm1           ; e0f0g0h0
        psraw       mm0,        11                ; sign extended shift right by 3

        pxor        mm1,        mm1           ; 0
        punpckhbw   mm1,        mm5           ; a0b0c0d0

        psraw       mm1,        11                ; sign extended shift right by 3
        movq        mm5,        mm0              ; save results

        packsswb    mm0,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      mm5,        [ones GLOBAL] ; round before the extra >>1

        paddsw      mm1,        [ones GLOBAL]
        psraw       mm5,        1                 ; partial shifted one more time for 2nd tap

        psraw       mm1,        1                 ; partial shifted one more time for 2nd tap
        packsswb    mm5,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4

        pandn       mm4,        mm5             ; high edge variance additive

        paddsb      mm6,        mm2             ; p0+= p0 add
        pxor        mm6,        [t80 GLOBAL]    ; unoffset

        ; mm6=p0                               ;
        movq        mm1,        [rdx]           ; p1
        pxor        mm1,        [t80 GLOBAL]    ; reoffset

        paddsb      mm1,        mm4                 ; p1+= p1 add
        pxor        mm1,        [t80 GLOBAL]        ; unoffset
        ; mm6 = p0 mm1 = p1

        psubsb      mm3,        mm0                 ; q0-= q0 add
        pxor        mm3,        [t80 GLOBAL]        ; unoffset

        ; mm3 = q0
        psubsb      mm7,        mm4                 ; q1-= q1 add
        pxor        mm7,        [t80 GLOBAL]        ; unoffset
        ; mm7 = q1

        ; transpose and write back the four modified columns
        ; mm1 =    72 62 52 42 32 22 12 02
        ; mm6 =    73 63 53 43 33 23 13 03
        ; mm3 =    74 64 54 44 34 24 14 04
        ; mm7 =    75 65 55 45 35 25 15 05

        movq        mm2,        mm1             ; 72 62 52 42 32 22 12 02
        punpcklbw   mm2,        mm6             ; 33 32 23 22 13 12 03 02

        movq        mm4,        mm3             ; 74 64 54 44 34 24 14 04
        punpckhbw   mm1,        mm6             ; 73 72 63 62 53 52 43 42

        punpcklbw   mm4,        mm7             ; 35 34 25 24 15 14 05 04
        punpckhbw   mm3,        mm7             ; 75 74 65 64 55 54 45 44

        movq        mm6,        mm2             ; 33 32 23 22 13 12 03 02
        punpcklwd   mm2,        mm4             ; 15 14 13 12 05 04 03 02

        punpckhwd   mm6,        mm4             ; 35 34 33 32 25 24 23 22
        movq        mm5,        mm1             ; 73 72 63 62 53 52 43 42

        punpcklwd   mm1,        mm3             ; 55 54 53 52 45 44 43 42
        punpckhwd   mm5,        mm3             ; 75 74 73 72 65 64 63 62


        ; mm2 = 15 14 13 12 05 04 03 02
        ; mm6 = 35 34 33 32 25 24 23 22
        ; mm5 = 55 54 53 52 45 44 43 42
        ; mm1 = 75 74 73 72 65 64 63 62

        ; store 4 bytes per row at column offset +2 (the p1..q1 columns);
        ; rax is still negated here, so rsi+rax*N addresses rows above
        movd        [rsi+rax*4+2], mm2
        psrlq       mm2,        32

        movd        [rdi+rax*4+2], mm2
        movd        [rsi+rax*2+2], mm6

        psrlq       mm6,        32
        movd        [rsi+rax+2],mm6

        movd        [rsi+2],    mm1
        psrlq       mm1,        32

        movd        [rdi+2],    mm1
        neg         rax                         ; restore positive pitch

        movd        [rdi+rax+2],mm5
        psrlq       mm5,        32

        movd        [rdi+rax*2+2], mm5

        lea         rsi,        [rsi+rax*8]     ; advance 8 rows
        dec         rcx
        jnz         next8_v

    add rsp, 64
    pop rsp                                     ; undo ALIGN_STACK (restores saved rsp)
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
    600 
    601 
    602 ;void vp8_mbloop_filter_horizontal_edge_mmx
    603 ;(
    604 ;    unsigned char *src_ptr,
    605 ;    int  src_pixel_step,
    606 ;    const char *flimit,
    607 ;    const char *limit,
    608 ;    const char *thresh,
    609 ;    int count
    610 ;)
    611 global sym(vp8_mbloop_filter_horizontal_edge_mmx)
    612 sym(vp8_mbloop_filter_horizontal_edge_mmx):
    613     push        rbp
    614     mov         rbp, rsp
    615     SHADOW_ARGS_TO_STACK 6
    616     GET_GOT     rbx
    617     push        rsi
    618     push        rdi
    619     ; end prolog
    620 
    621     ALIGN_STACK 16, rax
    622     sub          rsp, 32      ; reserve 32 bytes for two 16-byte-aligned 8-byte temporaries (t0, t1)
    623     %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
    624     %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
    625 
    626         mov         rsi, arg(0) ;src_ptr
    627         movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; row stride in bytes
    628 
    629         movsxd      rcx, dword ptr arg(5) ;count
    630 next8_mbh:                            ; each iteration filters one 8-pixel-wide column group
    631         mov         rdx, arg(3) ;limit
    632         movq        mm7, [rdx]
    633         mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
    634         add         rdi, rax
    635 
    636         ; calculate breakout conditions
    637         movq        mm2, [rdi+2*rax]      ; q3
    638 
    639         movq        mm1, [rsi+2*rax]      ; q2
    640         movq        mm6, mm1              ; q2
    641         psubusb     mm1, mm2              ; q2-=q3
    642         psubusb     mm2, mm6              ; q3-=q2
    643         por         mm1, mm2              ; abs(q3-q2)
    644         psubusb     mm1, mm7              ; nonzero bytes where abs(q3-q2) > limit
    645 
    646 
    647         ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit
    648         movq        mm4, [rsi+rax]        ; q1
    649         movq        mm3, mm4              ; q1
    650         psubusb     mm4, mm6              ; q1-=q2
    651         psubusb     mm6, mm3              ; q2-=q1
    652         por         mm4, mm6              ; abs(q2-q1)
    653         psubusb     mm4, mm7              ; nonzero bytes where abs(q2-q1) > limit
    654         por        mm1, mm4               ; accumulate breakout mask
    655 
    656 
    657         ; mm1 = mask,      mm3=q1, mm7 = limit
    658 
    659         movq        mm4, [rsi]            ; q0
    660         movq        mm0, mm4              ; q0
    661         psubusb     mm4, mm3              ; q0-=q1
    662         psubusb     mm3, mm0              ; q1-=q0
    663         por         mm4, mm3              ; abs(q0-q1)
    664         movq        t0, mm4               ; save to t0
    665         psubusb     mm4, mm7
    666         por        mm1, mm4               ; accumulate breakout mask
    667 
    668 
    669         ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
    670 
    671         neg         rax                   ; negate pitch to deal with above border
    672 
    673         movq        mm2, [rsi+4*rax]      ; p3
    674         movq        mm4, [rdi+4*rax]      ; p2
    675         movq        mm5, mm4              ; p2
    676         psubusb     mm4, mm2              ; p2-=p3
    677         psubusb     mm2, mm5              ; p3-=p2
    678         por         mm4, mm2              ; abs(p3 - p2)
    679         psubusb     mm4, mm7
    680         por        mm1, mm4               ; accumulate breakout mask
    681         ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
    682 
    683         movq        mm4, [rsi+2*rax]      ; p1
    684         movq        mm3, mm4              ; p1
    685         psubusb     mm4, mm5              ; p1-=p2
    686         psubusb     mm5, mm3              ; p2-=p1
    687         por         mm4, mm5              ; abs(p2 - p1)
    688         psubusb     mm4, mm7
    689         por        mm1, mm4               ; accumulate breakout mask
    690 
    691         movq        mm2, mm3              ; p1
    692 
    693 
    694         ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
    695 
    696         movq        mm4, [rsi+rax]        ; p0
    697         movq        mm5, mm4              ; p0
    698         psubusb     mm4, mm3              ; p0-=p1
    699         psubusb     mm3, mm5              ; p1-=p0
    700         por         mm4, mm3              ; abs(p1 - p0)
    701         movq        t1, mm4               ; save to t1
    702         psubusb     mm4, mm7
    703         por        mm1, mm4               ; accumulate breakout mask
    704         ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
    705         ; mm5 = p0
    706         movq        mm3, [rdi]            ; q1
    707         movq        mm4, mm3              ; q1
    708         psubusb     mm3, mm2              ; q1-=p1
    709         psubusb     mm2, mm4              ; p1-=q1
    710         por         mm2, mm3              ; abs(p1-q1)
    711         pand        mm2, [tfe GLOBAL]     ; set lsb of each byte to zero
    712         psrlw       mm2, 1                ; abs(p1-q1)/2
    713 
    714         movq        mm6, mm5              ; p0
    715         movq        mm3, mm0              ; q0
    716         psubusb     mm5, mm3              ; p0-=q0
    717         psubusb     mm3, mm6              ; q0-=p0
    718         por         mm5, mm3              ; abs(p0 - q0)
    719         paddusb     mm5, mm5              ; abs(p0-q0)*2
    720         paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
    721 
    722         mov         rdx, arg(2) ;flimit           ; get flimit
    723         movq        mm2, [rdx]            ; flimit mm2
    724         paddb       mm2, mm2              ; flimit*2 (less than 255)
    725         paddb       mm7, mm2              ; flimit * 2 + limit (less than 255)
    726 
    727         psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
    728         por         mm1,    mm5
    729         pxor        mm5,    mm5
    730         pcmpeqb     mm1,    mm5           ; mask mm1 (0xff per byte where all breakout tests pass)
    731 
    732         ; mm1 = mask, mm0=q0,  mm7 = flimit*2+limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
    733         ; mm6 = p0,
    734 
    735         ; calculate high edge variance
    736         mov         rdx, arg(4) ;thresh           ; get thresh
    737         movq        mm7, [rdx]            ;
    738         movq        mm4, t0               ; get abs (q1 - q0)
    739         psubusb     mm4, mm7
    740         movq        mm3, t1               ; get abs (p1 - p0)
    741         psubusb     mm3, mm7
    742         paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
    743 
    744         pcmpeqb     mm4,        mm5
    745 
    746         pcmpeqb     mm5,        mm5
    747         pxor        mm4,        mm5       ; mm4 = hev mask (0xff where high edge variance)
    748 
    749 
    750 
    751         ; mm1 = mask, mm0=q0,  mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
    752         ; mm6 = p0, mm4=hev
    753         ; start work on filters
    754         movq        mm2, [rsi+2*rax]      ; p1
    755         movq        mm7, [rdi]            ; q1
    756         pxor        mm2, [t80 GLOBAL]     ; p1 offset to convert to signed values
    757         pxor        mm7, [t80 GLOBAL]     ; q1 offset to convert to signed values
    758         psubsb      mm2, mm7              ; p1 - q1
    759 
    760         pxor        mm6, [t80 GLOBAL]     ; offset to convert to signed values
    761         pxor        mm0, [t80 GLOBAL]     ; offset to convert to signed values
    762         movq        mm3, mm0              ; q0
    763         psubsb      mm0, mm6              ; q0 - p0
    764         paddsb      mm2, mm0              ; 1 * (q0 - p0) + (p1 - q1)
    765         paddsb      mm2, mm0              ; 2 * (q0 - p0) + (p1 - q1)
    766         paddsb      mm2, mm0              ; 3 * (q0 - p0) + (p1 - q1)
    767         pand        mm1, mm2              ; mask filter values we don't care about
    768 
    769 
    770         ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
    771         movq        mm2, mm1              ; vp8_filter
    772         pand        mm2, mm4;             ; Filter2 = vp8_filter & hev
    773 
    774         movq        mm5,        mm2       ;
    775         paddsb      mm5,        [t3 GLOBAL]; vp8_signed_char_clamp(Filter2 + 3)
    776 
    777         pxor        mm0, mm0              ; 0
    778         pxor        mm7, mm7              ; 0
    779 
    780         punpcklbw   mm0, mm5              ; e0f0g0h0
    781         psraw       mm0, 11               ; sign extended shift right by 3
    782         punpckhbw   mm7, mm5              ; a0b0c0d0
    783         psraw       mm7, 11               ; sign extended shift right by 3
    784         packsswb    mm0, mm7              ; Filter2 >>=3;
    785 
    786         movq        mm5, mm0              ; Filter2
    787 
    788         paddsb      mm2, [t4 GLOBAL]      ; vp8_signed_char_clamp(Filter2 + 4)
    789         pxor        mm0, mm0              ; 0
    790         pxor        mm7, mm7              ; 0
    791 
    792         punpcklbw   mm0, mm2              ; e0f0g0h0
    793         psraw       mm0, 11               ; sign extended shift right by 3
    794         punpckhbw   mm7, mm2              ; a0b0c0d0
    795         psraw       mm7, 11               ; sign extended shift right by 3
    796         packsswb    mm0, mm7              ; Filter1 = (vp8_filter + 4) >> 3
    797 
    798         ; mm0 = Filter1, mm5 = Filter2, mm1 = vp8_filter, mm3 = qs0, mm4 = hev, mm6 = ps0
    799         psubsb      mm3, mm0              ; qs0 = qs0 - Filter1
    800         paddsb      mm6, mm5              ; ps0 = ps0 + Filter2
    801 
    802         ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
    803         ; vp8_filter &= ~hev;
    804         ; Filter2 = vp8_filter;
    805         pandn       mm4, mm1              ; vp8_filter&=~hev
    806 
    807 
    808         ; mm3=qs0, mm4=filter2, mm6=ps0
    809 
    810         ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
    811         ; s = vp8_signed_char_clamp(qs0 - u);
    812         ; *oq0 = s^0x80;
    813         ; s = vp8_signed_char_clamp(ps0 + u);
    814         ; *op0 = s^0x80;
    815         pxor        mm0, mm0
    816 
    817         pxor        mm1, mm1
    818         pxor        mm2, mm2
    819         punpcklbw   mm1, mm4
    820         punpckhbw   mm2, mm4
    821         pmulhw      mm1, [s27 GLOBAL]
    822         pmulhw      mm2, [s27 GLOBAL]
    823         paddw       mm1, [s63 GLOBAL]
    824         paddw       mm2, [s63 GLOBAL]
    825         psraw       mm1, 7
    826         psraw       mm2, 7
    827         packsswb    mm1, mm2              ; u
    828 
    829         psubsb      mm3, mm1              ; qs0 - u
    830         paddsb      mm6, mm1              ; ps0 + u
    831 
    832         pxor        mm3, [t80 GLOBAL]
    833         pxor        mm6, [t80 GLOBAL]
    834         movq        [rsi+rax], mm6        ; *op0
    835         movq        [rsi],     mm3        ; *oq0
    836 
    837         ; roughly 2/7th difference across boundary
    838         ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
    839         ; s = vp8_signed_char_clamp(qs1 - u);
    840         ; *oq1 = s^0x80;
    841         ; s = vp8_signed_char_clamp(ps1 + u);
    842         ; *op1 = s^0x80;
    843         pxor        mm1, mm1
    844         pxor        mm2, mm2
    845         punpcklbw   mm1, mm4
    846         punpckhbw   mm2, mm4
    847         pmulhw      mm1, [s18 GLOBAL]
    848         pmulhw      mm2, [s18 GLOBAL]
    849         paddw       mm1, [s63 GLOBAL]
    850         paddw       mm2, [s63 GLOBAL]
    851         psraw       mm1, 7
    852         psraw       mm2, 7
    853         packsswb    mm1, mm2              ; u
    854 
    855         movq        mm3, [rdi]            ; q1
    856         movq        mm6, [rsi+rax*2]       ; p1
    857 
    858         pxor        mm3, [t80 GLOBAL]
    859         pxor        mm6, [t80 GLOBAL]
    860 
    861         paddsb      mm6, mm1
    862         psubsb      mm3, mm1
    863 
    864         pxor        mm6, [t80 GLOBAL]
    865         pxor        mm3, [t80 GLOBAL]
    866         movq        [rdi], mm3            ; *oq1
    867         movq        [rsi+rax*2], mm6      ; *op1
    868 
    869         ; roughly 1/7th difference across boundary
    870         ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
    871         ; s = vp8_signed_char_clamp(qs2 - u);
    872         ; *oq2 = s^0x80;
    873         ; s = vp8_signed_char_clamp(ps2 + u);
    874         ; *op2 = s^0x80;
    875         pxor        mm1, mm1
    876         pxor        mm2, mm2
    877         punpcklbw   mm1, mm4
    878         punpckhbw   mm2, mm4
    879         pmulhw      mm1, [s9 GLOBAL]
    880         pmulhw      mm2, [s9 GLOBAL]
    881         paddw       mm1, [s63 GLOBAL]
    882         paddw       mm2, [s63 GLOBAL]
    883         psraw       mm1, 7
    884         psraw       mm2, 7
    885         packsswb    mm1, mm2              ; u
    886 
    887 
    888         movq        mm6, [rdi+rax*4]      ; p2 (rax is negative here)
    889         neg         rax
    890         movq        mm3, [rdi+rax  ]      ; q2
    891 
    892         pxor        mm6, [t80 GLOBAL]
    893         pxor        mm3, [t80 GLOBAL]
    894 
    895         paddsb      mm6, mm1
    896         psubsb      mm3, mm1
    897 
    898         pxor        mm6, [t80 GLOBAL]
    899         pxor        mm3, [t80 GLOBAL]
    900         movq        [rdi+rax  ], mm3      ; *oq2
    901         neg         rax
    902         movq        [rdi+rax*4], mm6      ; *op2
    903 
    904 ;EARLY_BREAK_OUT:
    905         neg         rax                   ; restore positive pitch
    906         add         rsi,8                 ; advance to the next group of 8 columns
    907         dec         rcx
    908         jnz         next8_mbh
    909 
    910     add rsp, 32
    911     pop rsp                               ; restore rsp saved by ALIGN_STACK
    912     ; begin epilog
    913     pop rdi
    914     pop rsi
    915     RESTORE_GOT
    916     UNSHADOW_ARGS
    917     pop         rbp
    918     ret
    919 
    920 
    921 ;void vp8_mbloop_filter_vertical_edge_mmx
    922 ;(
    923 ;    unsigned char *src_ptr,
    924 ;    int  src_pixel_step,
    925 ;    const char *flimit,
    926 ;    const char *limit,
    927 ;    const char *thresh,
    928 ;    int count
    929 ;)
    930 global sym(vp8_mbloop_filter_vertical_edge_mmx)
    931 sym(vp8_mbloop_filter_vertical_edge_mmx):
    932     push        rbp
    933     mov         rbp, rsp
    934     SHADOW_ARGS_TO_STACK 6
    935     GET_GOT     rbx
    936     push        rsi
    937     push        rdi
    938     ; end prolog
    939 
    940     ALIGN_STACK 16, rax
    941     sub          rsp, 96      ; reserve 96 bytes: t0, t1 and the 64-byte transpose scratch srct
    942     %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
    943     %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
    944     %define srct [rsp + 32]   ;__declspec(align(16)) char srct[64];
    945 
    946         mov         rsi,        arg(0) ;src_ptr
    947         movsxd      rax,        dword ptr arg(1) ;src_pixel_step     ; row stride in bytes
    948 
    949         lea         rsi,        [rsi + rax*4 - 4]  ; rsi -> row 4, 4 pixels left of the vertical edge
    950 
    951         movsxd      rcx,        dword ptr arg(5) ;count
    952 next8_mbv:                                       ; each iteration filters an 8-row band across the edge
    953         lea         rdi,        [rsi + rax]  ; rdi points to row +1 for indirect addressing
    954 
    955         ;transpose the 8x8 block so p3..q3 become MMX rows
    956         movq        mm0,        [rdi+2*rax]                 ; 77 76 75 74 73 72 71 70
    957         movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
    958 
    959         movq        mm7,        mm6                         ; 67 66 65 64 63 62 61 60
    960         punpckhbw   mm7,        mm0                         ; 77 67 76 66 75 65 74 64
    961 
    962         punpcklbw   mm6,        mm0                         ; 73 63 72 62 71 61 70 60
    963         movq        mm0,        [rsi+rax]                   ; 57 56 55 54 53 52 51 50
    964 
    965         movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
    966         movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40
    967 
    968         punpckhbw   mm5,        mm0                         ; 57 47 56 46 55 45 54 44
    969         punpcklbw   mm4,        mm0                         ; 53 43 52 42 51 41 50 40
    970 
    971         movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
    972         punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46
    973 
    974         punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
    975         movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40
    976 
    977         punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
    978         punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40
    979 
    980         neg         rax                                     ; negative pitch to reach rows 0-3 above
    981 
    982         movq        mm7,        [rsi+rax]                   ; 37 36 35 34 33 32 31 30
    983         movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20
    984 
    985         movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
    986         punpckhbw   mm6,        mm7                         ; 37 27 36 26 35 25 34 24
    987 
    988         punpcklbw   mm1,        mm7                         ; 33 23 32 22 31 21 30 20
    989 
    990         movq        mm7,        [rsi+rax*4];                ; 07 06 05 04 03 02 01 00
    991         punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
    992 
    993         movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04
    994         punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
    995 
    996         punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04
    997         movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
    998 
    999         punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3
   1000         punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2
   1001 
   1002         lea         rdx,        srct                        ; rdx -> transpose scratch area
   1003         movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06
   1004 
   1005         movq        [rdx+56],   mm7                         ; save q3
   1006         psubusb     mm5,        mm7                         ; q2-q3
   1007 
   1008 
   1009         movq        [rdx+48],   mm6                         ; save q2
   1010         psubusb     mm7,        mm6                         ; q3-q2
   1011 
   1012         por         mm7,        mm5;                        ; mm7=abs (q3-q2)
   1013         movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
   1014 
   1015         punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1
   1016         punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 14 04 = q0
   1017 
   1018         movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1
   1019         psubusb     mm3,        mm6                         ; q1-q2
   1020 
   1021         psubusb     mm6,        mm5                         ; q2-q1
   1022         por         mm6,        mm3                         ; mm6=abs(q2-q1)
   1023 
   1024         movq        [rdx+40],   mm5                         ; save q1
   1025         movq        [rdx+32],   mm0                         ; save q0
   1026 
   1027         movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
   1028         punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00
   1029 
   1030         movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
   1031         punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00
   1032 
   1033         punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
   1034         movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00
   1035 
   1036         punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
   1037         punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2
   1038 
   1039         movq        [rdx],      mm0                         ; save p3
   1040         movq        [rdx+8],    mm1                         ; save p2
   1041 
   1042         movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
   1043         psubusb     mm2,        mm0                         ; p2-p3
   1044 
   1045         psubusb     mm0,        mm1                         ; p3-p2
   1046         por         mm0,        mm2                         ; mm0=abs(p3-p2)
   1047 
   1048         movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
   1049         punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1
   1050 
   1051         punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
   1052         movq        [rdx+24],   mm3                         ; save p0
   1053 
   1054         movq        [rdx+16],   mm2                         ; save p1
   1055         movq        mm5,        mm2                         ; mm5 = p1
   1056 
   1057         psubusb     mm2,        mm1                         ; p1-p2
   1058         psubusb     mm1,        mm5                         ; p2-p1
   1059 
   1060         por         mm1,        mm2                         ; mm1=abs(p2-p1)
   1061         mov         rdx,        arg(3) ;limit
   1062 
   1063         movq        mm4,        [rdx]                       ; mm4 = limit
   1064         psubusb     mm7,        mm4                         ; abs(q3-q2) > limit
   1065 
   1066         psubusb     mm0,        mm4                         ; abs(p3-p2) > limit
   1067         psubusb     mm1,        mm4                         ; abs(p2-p1) > limit
   1068 
   1069         psubusb     mm6,        mm4                         ; abs(q2-q1) > limit
   1070         por         mm7,        mm6                         ; or
   1071 
   1072         por         mm0,        mm1                         ;
   1073         por         mm0,        mm7                         ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
   1074 
   1075         movq        mm1,        mm5                         ; p1
   1076 
   1077         movq        mm7,        mm3                         ; mm3=mm7=p0
   1078         psubusb     mm7,        mm5                         ; p0 - p1
   1079 
   1080         psubusb     mm5,        mm3                         ; p1 - p0
   1081         por         mm5,        mm7                         ; abs(p1-p0)
   1082 
   1083         movq        t0,         mm5                         ; save abs(p1-p0)
   1084         lea         rdx,        srct
   1085 
   1086         psubusb     mm5,        mm4                         ; mm5 = abs(p1-p0) > limit
   1087         por         mm0,        mm5                         ; mm0=mask
   1088 
   1089         movq        mm5,        [rdx+32]                    ; mm5=q0
   1090         movq        mm7,        [rdx+40]                    ; mm7=q1
   1091 
   1092         movq        mm6,        mm5                         ; mm6=q0
   1093         movq        mm2,        mm7                         ; q1
   1094         psubusb     mm5,        mm7                         ; q0-q1
   1095 
   1096         psubusb     mm7,        mm6                         ; q1-q0
   1097         por         mm7,        mm5                         ; abs(q1-q0)
   1098 
   1099         movq        t1,         mm7                         ; save abs(q1-q0)
   1100         psubusb     mm7,        mm4                         ; mm7=abs(q1-q0)> limit
   1101 
   1102         por         mm0,        mm7                         ; mask
   1103 
   1104         movq        mm5,        mm2                         ; q1
   1105         psubusb     mm5,        mm1                         ; q1-=p1
   1106         psubusb     mm1,        mm2                         ; p1-=q1
   1107         por         mm5,        mm1                         ; abs(p1-q1)
   1108         pand        mm5,        [tfe GLOBAL]                ; set lsb of each byte to zero
   1109         psrlw       mm5,        1                           ; abs(p1-q1)/2
   1110 
   1111         mov         rdx,        arg(2) ;flimit                      ;
   1112 
   1113         movq        mm2,        [rdx]                       ;flimit  mm2
   1114         movq        mm1,        mm3                         ; mm1=mm3=p0
   1115 
   1116         movq        mm7,        mm6                         ; mm7=mm6=q0
   1117         psubusb     mm1,        mm7                         ; p0-q0
   1118 
   1119         psubusb     mm7,        mm3                         ; q0-p0
   1120         por         mm1,        mm7                         ; abs(q0-p0)
   1121         paddusb     mm1,        mm1                         ; abs(q0-p0)*2
   1122         paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
   1123 
   1124         paddb       mm2,        mm2                         ; flimit*2 (less than 255)
   1125         paddb       mm4,        mm2                         ; flimit * 2 + limit (less than 255)
   1126 
   1127         psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
   1128         por         mm1,        mm0;                        ; mask
   1129 
   1130         pxor        mm0,        mm0
   1131         pcmpeqb     mm1,        mm0                         ; mm1 = filter mask (0xff per byte where all checks pass)
   1132 
   1133         ; calculate high edge variance
   1134         mov         rdx,        arg(4) ;thresh            ; get thresh
   1135         movq        mm7,        [rdx]
   1136         ;
   1137         movq        mm4,        t0              ; get abs (p1 - p0)
   1138         psubusb     mm4,        mm7             ; abs(p1 - p0) > thresh
   1139 
   1140         movq        mm3,        t1              ; get abs (q1 - q0)
   1141         psubusb     mm3,        mm7             ; abs(q1 - q0) > thresh
   1142 
   1143         por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
   1144         pcmpeqb     mm4,        mm0
   1145 
   1146         pcmpeqb     mm0,        mm0
   1147         pxor        mm4,        mm0             ; mm4 = hev mask (0xff where high edge variance)
   1148 
   1149 
   1150 
   1151 
   1152         ; start work on filters
   1153         lea         rdx,        srct
   1154 
   1155         ; start work on filters
   1156         movq        mm2, [rdx+16]         ; p1
   1157         movq        mm7, [rdx+40]         ; q1
   1158         pxor        mm2, [t80 GLOBAL]     ; p1 offset to convert to signed values
   1159         pxor        mm7, [t80 GLOBAL]     ; q1 offset to convert to signed values
   1160         psubsb      mm2, mm7              ; p1 - q1
   1161 
   1162         movq        mm6, [rdx+24]         ; p0
   1163         movq        mm0, [rdx+32]         ; q0
   1164         pxor        mm6, [t80 GLOBAL]     ; offset to convert to signed values
   1165         pxor        mm0, [t80 GLOBAL]     ; offset to convert to signed values
   1166 
   1167         movq        mm3, mm0              ; q0
   1168         psubsb      mm0, mm6              ; q0 - p0
   1169         paddsb      mm2, mm0              ; 1 * (q0 - p0) + (p1 - q1)
   1170         paddsb      mm2, mm0              ; 2 * (q0 - p0) + (p1 - q1)
   1171         paddsb      mm2, mm0              ; 3 * (q0 - p0) + (p1 - q1)
   1172         pand       mm1, mm2           ; mask filter values we don't care about
   1173 
   1174         ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
   1175         movq        mm2, mm1              ; vp8_filter
   1176         pand        mm2, mm4;             ; Filter2 = vp8_filter & hev
   1177 
   1178         movq        mm5,        mm2       ;
   1179         paddsb      mm5,        [t3 GLOBAL]; vp8_signed_char_clamp(Filter2 + 3)
   1180 
   1181         pxor        mm0, mm0              ; 0
   1182         pxor        mm7, mm7              ; 0
   1183 
   1184         punpcklbw   mm0, mm5              ; e0f0g0h0
   1185         psraw       mm0, 11               ; sign extended shift right by 3
   1186         punpckhbw   mm7, mm5              ; a0b0c0d0
   1187         psraw       mm7, 11               ; sign extended shift right by 3
   1188         packsswb    mm0, mm7              ; Filter2 >>=3;
   1189 
   1190         movq        mm5, mm0              ; Filter2
   1191 
   1192         paddsb      mm2, [t4 GLOBAL]      ; vp8_signed_char_clamp(Filter2 + 4)
   1193         pxor        mm0, mm0              ; 0
   1194         pxor        mm7, mm7              ; 0
   1195 
   1196         punpcklbw   mm0, mm2              ; e0f0g0h0
   1197         psraw       mm0, 11               ; sign extended shift right by 3
   1198         punpckhbw   mm7, mm2              ; a0b0c0d0
   1199         psraw       mm7, 11               ; sign extended shift right by 3
   1200         packsswb    mm0, mm7              ; Filter1 = (vp8_filter + 4) >> 3
   1201 
   1202         ; mm0 = Filter1, mm5 = Filter2, mm1 = vp8_filter, mm3 = qs0, mm4 = hev, mm6 = ps0
   1203         psubsb      mm3, mm0              ; qs0 = qs0 - Filter1
   1204         paddsb      mm6, mm5              ; ps0 = ps0 + Filter2
   1205 
   1206         ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
   1207         ; vp8_filter &= ~hev;
   1208         ; Filter2 = vp8_filter;
   1209         pandn       mm4, mm1              ; vp8_filter&=~hev
   1210 
   1211 
   1212         ; mm3=qs0, mm4=filter2, mm6=ps0
   1213 
   1214         ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
   1215         ; s = vp8_signed_char_clamp(qs0 - u);
   1216         ; *oq0 = s^0x80;
   1217         ; s = vp8_signed_char_clamp(ps0 + u);
   1218         ; *op0 = s^0x80;
   1219         pxor        mm0, mm0
   1220 
   1221         pxor        mm1, mm1
   1222         pxor        mm2, mm2
   1223         punpcklbw   mm1, mm4
   1224         punpckhbw   mm2, mm4
   1225         pmulhw      mm1, [s27 GLOBAL]
   1226         pmulhw      mm2, [s27 GLOBAL]
   1227         paddw       mm1, [s63 GLOBAL]
   1228         paddw       mm2, [s63 GLOBAL]
   1229         psraw       mm1, 7
   1230         psraw       mm2, 7
   1231         packsswb    mm1, mm2              ; u
   1232 
   1233         psubsb      mm3, mm1              ; qs0 - u
   1234         paddsb      mm6, mm1              ; ps0 + u
   1235 
   1236         pxor        mm3, [t80 GLOBAL]
   1237         pxor        mm6, [t80 GLOBAL]
   1238         movq        [rdx+24], mm6         ; store new p0 back to scratch
   1239         movq        [rdx+32], mm3         ; store new q0 back to scratch
   1240 
   1241         ; roughly 2/7th difference across boundary
   1242         ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
   1243         ; s = vp8_signed_char_clamp(qs1 - u);
   1244         ; *oq1 = s^0x80;
   1245         ; s = vp8_signed_char_clamp(ps1 + u);
   1246         ; *op1 = s^0x80;
   1247         pxor        mm1, mm1
   1248         pxor        mm2, mm2
   1249         punpcklbw   mm1, mm4
   1250         punpckhbw   mm2, mm4
   1251         pmulhw      mm1, [s18 GLOBAL]
   1252         pmulhw      mm2, [s18 GLOBAL]
   1253         paddw       mm1, [s63 GLOBAL]
   1254         paddw       mm2, [s63 GLOBAL]
   1255         psraw       mm1, 7
   1256         psraw       mm2, 7
   1257         packsswb    mm1, mm2              ; u
   1258 
   1259         movq        mm3, [rdx + 40]       ; q1
   1260         movq        mm6, [rdx + 16]       ; p1
   1261         pxor        mm3, [t80 GLOBAL]
   1262         pxor        mm6, [t80 GLOBAL]
   1263 
   1264         paddsb      mm6, mm1
   1265         psubsb      mm3, mm1
   1266 
   1267         pxor        mm6, [t80 GLOBAL]
   1268         pxor        mm3, [t80 GLOBAL]
   1269         movq        [rdx + 40], mm3       ; store new q1 back to scratch
   1270         movq        [rdx + 16], mm6       ; store new p1 back to scratch
   1271 
   1272         ; roughly 1/7th difference across boundary
   1273         ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
   1274         ; s = vp8_signed_char_clamp(qs2 - u);
   1275         ; *oq2 = s^0x80;
   1276         ; s = vp8_signed_char_clamp(ps2 + u);
   1277         ; *op2 = s^0x80;
   1278         pxor        mm1, mm1
   1279         pxor        mm2, mm2
   1280         punpcklbw   mm1, mm4
   1281         punpckhbw   mm2, mm4
   1282         pmulhw      mm1, [s9 GLOBAL]
   1283         pmulhw      mm2, [s9 GLOBAL]
   1284         paddw       mm1, [s63 GLOBAL]
   1285         paddw       mm2, [s63 GLOBAL]
   1286         psraw       mm1, 7
   1287         psraw       mm2, 7
   1288         packsswb    mm1, mm2              ; u
   1289 
   1290         movq        mm6, [rdx+ 8]         ; p2
   1291         movq        mm3, [rdx+48]         ; q2
   1292 
   1293         pxor        mm6, [t80 GLOBAL]
   1294         pxor        mm3, [t80 GLOBAL]
   1295 
   1296         paddsb      mm6, mm1
   1297         psubsb      mm3, mm1
   1298 
   1299         pxor        mm6, [t80 GLOBAL]           ; mm6 = 71 61 51 41 31 21 11 01
   1300         pxor        mm3, [t80 GLOBAL]           ; mm3 = 76 66 56 46 36 26 16 06
   1301 
   1302         ; transpose and write back
   1303         movq        mm0,    [rdx]               ; mm0 = 70 60 50 40 30 20 10 00
   1304         movq        mm1,    mm0                 ; mm1 = 70 60 50 40 30 20 10 00
   1305 
   1306         punpcklbw   mm0,    mm6                 ; mm0 = 31 30 21 20 11 10 01 00
   1307         punpckhbw   mm1,    mm6                 ; mm1 = 71 70 61 60 51 50 41 40
   1308 
   1309         movq        mm2,    [rdx+16]            ; mm2 = 72 62 52 42 32 22 12 02
   1310         movq        mm6,    mm2                 ; mm6 = 72 62 52 42 32 22 12 02
   1311 
   1312         punpcklbw   mm2,    [rdx+24]            ; mm2 = 33 32 23 22 13 12 03 02
   1313         punpckhbw   mm6,    [rdx+24]            ; mm6 = 73 72 63 62 53 52 43 42
   1314 
   1315         movq        mm5,    mm0                 ; mm5 = 31 30 21 20 11 10 01 00
   1316         punpcklwd   mm0,    mm2                 ; mm0 = 13 12 11 10 03 02 01 00
   1317 
   1318         punpckhwd   mm5,    mm2                 ; mm5 = 33 32 31 30 23 22 21 20
   1319         movq        mm4,    mm1                 ; mm4 = 71 70 61 60 51 50 41 40
   1320 
   1321         punpcklwd   mm1,    mm6                 ; mm1 = 53 52 51 50 43 42 41 40
   1322         punpckhwd   mm4,    mm6                 ; mm4 = 73 72 71 70 63 62 61 60
   1323 
   1324         movq        mm2,    [rdx+32]            ; mm2 = 74 64 54 44 34 24 14 04
   1325         punpcklbw   mm2,    [rdx+40]            ; mm2 = 35 34 25 24 15 14 05 04
   1326 
   1327         movq        mm6,    mm3                 ; mm6 = 76 66 56 46 36 26 16 06
   1328         punpcklbw   mm6,    [rdx+56]            ; mm6 = 37 36 27 26 17 16 07 06
   1329 
   1330         movq        mm7,    mm2                 ; mm7 = 35 34 25 24 15 14 05 04
   1331         punpcklwd   mm2,    mm6                 ; mm2 = 17 16 15 14 07 06 05 04
   1332 
   1333         punpckhwd   mm7,    mm6                 ; mm7 = 37 36 35 34 27 26 25 24
   1334         movq        mm6,    mm0                 ; mm6 = 13 12 11 10 03 02 01 00
   1335 
   1336         punpckldq   mm0,    mm2                 ; mm0 = 07 06 05 04 03 02 01 00
   1337         punpckhdq   mm6,    mm2                 ; mm6 = 17 16 15 14 13 12 11 10
   1338 
   1339         movq        [rsi+rax*4], mm0            ; write out row 0 (rax is still negative here)
   1340         movq        [rdi+rax*4], mm6            ; write out row 1
   1341 
   1342         movq        mm0,    mm5                 ; mm0 = 33 32 31 30 23 22 21 20
   1343         punpckldq   mm0,    mm7                 ; mm0 = 27 26 25 24 23 22 21 20
   1344 
   1345         punpckhdq   mm5,    mm7                 ; mm5 = 37 36 35 34 33 32 31 30
   1346         movq        [rsi+rax*2], mm0            ; write out row 2
   1347 
   1348         movq        [rdi+rax*2], mm5            ; write out row 3
   1349         movq        mm2,    [rdx+32]            ; mm2 = 74 64 54 44 34 24 14 04
   1350 
   1351         punpckhbw   mm2,    [rdx+40]            ; mm2 = 75 74 65 64 55 54 45 44
   1352         punpckhbw   mm3,    [rdx+56]            ; mm3 = 77 76 67 66 57 56 47 46
   1353 
   1354         movq        mm5,    mm2                 ; mm5 = 75 74 65 64 55 54 45 44
   1355         punpcklwd   mm2,    mm3                 ; mm2 = 57 56 55 54 47 46 45 44
   1356 
   1357         punpckhwd   mm5,    mm3                 ; mm5 = 77 76 75 74 67 66 65 64
   1358         movq        mm0,    mm1                 ; mm0=  53 52 51 50 43 42 41 40
   1359 
   1360         movq        mm3,    mm4                 ; mm3 = 73 72 71 70 63 62 61 60
   1361         punpckldq   mm0,    mm2                 ; mm0 = 47 46 45 44 43 42 41 40
   1362 
   1363         punpckhdq   mm1,    mm2                 ; mm1 = 57 56 55 54 53 52 51 50
   1364         movq        [rsi],  mm0                 ; write out
   1365 
   1366         movq        [rdi],  mm1                 ; write out
   1367         neg         rax
   1368 
   1369         punpckldq   mm3,    mm5                 ; mm3 = 67 66 65 64 63 62 61 60
   1370         punpckhdq   mm4,    mm5                 ; mm4 = 77 76 75 74 73 72 71 60
   1371 
   1372         movq        [rsi+rax*2], mm3
   1373         movq        [rdi+rax*2], mm4
   1374 
   1375         lea         rsi,        [rsi+rax*8]
   1376         dec         rcx
   1377 
   1378         jnz         next8_mbv
   1379 
   1380     add rsp, 96
   1381     pop rsp
   1382     ; begin epilog
   1383     pop rdi
   1384     pop rsi
   1385     RESTORE_GOT
   1386     UNSHADOW_ARGS
   1387     pop         rbp
   1388     ret
   1389 
   1390 
   1391 ;void vp8_loop_filter_simple_horizontal_edge_mmx
   1392 ;(
   1393 ;    unsigned char *src_ptr,
   1394 ;    int  src_pixel_step,
   1395 ;    const char *flimit,
   1396 ;    const char *limit,
   1397 ;    const char *thresh,
   1398 ;    int count
   1399 ;)
   1400 global sym(vp8_loop_filter_simple_horizontal_edge_mmx)
   1401 sym(vp8_loop_filter_simple_horizontal_edge_mmx):
   1402     push        rbp
   1403     mov         rbp, rsp
   1404     SHADOW_ARGS_TO_STACK 6
   1405     GET_GOT     rbx
   1406     push        rsi
   1407     push        rdi
   1408     ; end prolog
   1409 
   1410         mov         rsi, arg(0) ;src_ptr
   1411         movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; row stride in bytes
   1412 
   1413         movsxd      rcx, dword ptr arg(5) ;count              ; number of 8-pixel-wide groups
   1414 nexts8_h:
   1415         mov         rdx, arg(3) ;limit
   1416         movq        mm7, [rdx]
   1417         mov         rdx, arg(2) ;flimit           ; get flimit
   1418         movq        mm3, [rdx]            ; mm3 = flimit
   1419         paddb       mm3, mm3              ; flimit*2 (less than 255)
   1420         paddb       mm3, mm7              ; flimit * 2 + limit (less than 255)
   1421 
   1422         mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
   1423         add         rdi, rax
   1424         neg         rax                   ; rax = -stride so [rsi+rax] addresses rows above
   1425 
   1426         ; calculate mask: does |p0-q0|*2 + |p1-q1|/2 stay under flimit*2 + limit?
   1427         movq        mm1, [rsi+2*rax]      ; p1
   1428         movq        mm0, [rdi]            ; q1
   1429         movq        mm2, mm1              ; keep unsigned p1 for the filter stage
   1430         movq        mm7, mm0              ; keep unsigned q1 for the filter stage
   1431         movq        mm4, mm0
   1432         psubusb     mm0, mm1              ; q1-=p1
   1433         psubusb     mm1, mm4              ; p1-=q1
   1434         por         mm1, mm0              ; abs(p1-q1)
   1435         pand        mm1, [tfe GLOBAL]     ; set lsb of each byte to zero
   1436         psrlw       mm1, 1                ; abs(p1-q1)/2 per byte (no cross-byte borrow: lsbs cleared)
   1437 
   1438         movq        mm5, [rsi+rax]        ; p0
   1439         movq        mm4, [rsi]            ; q0
   1440         movq        mm0, mm4              ; q0
   1441         movq        mm6, mm5              ; p0
   1442         psubusb     mm5, mm4              ; p0-=q0
   1443         psubusb     mm4, mm6              ; q0-=p0
   1444         por         mm5, mm4              ; abs(p0 - q0)
   1445         paddusb     mm5, mm5              ; abs(p0-q0)*2
   1446         paddusb     mm5, mm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
   1447 
   1448         psubusb     mm5, mm3              ; 0 where abs(p0 - q0) *2 + abs(p1-q1)/2 <= flimit * 2 + limit
   1449         pxor        mm3, mm3              ; zero for the byte compare
   1450         pcmpeqb     mm5, mm3              ; mask: FF for pixels below threshold (to be filtered)
   1451 
   1452         ; start work on filters
   1453         pxor        mm2, [t80 GLOBAL]     ; p1 offset to convert to signed values
   1454         pxor        mm7, [t80 GLOBAL]     ; q1 offset to convert to signed values
   1455         psubsb      mm2, mm7              ; p1 - q1
   1456 
   1457         pxor        mm6, [t80 GLOBAL]     ; offset to convert to signed values
   1458         pxor        mm0, [t80 GLOBAL]     ; offset to convert to signed values
   1459         movq        mm3, mm0              ; q0
   1460         psubsb      mm0, mm6              ; q0 - p0
   1461         paddsb      mm2, mm0              ; p1 - q1 + 1 * (q0 - p0)
   1462         paddsb      mm2, mm0              ; p1 - q1 + 2 * (q0 - p0)
   1463         paddsb      mm2, mm0              ; p1 - q1 + 3 * (q0 - p0)
   1464         pand        mm5, mm2              ; mask filter values we don't care about
   1465 
   1466         ; do + 4 side
   1467         paddsb      mm5, [t4 GLOBAL]      ; 3* (q0 - p0) + (p1 - q1) + 4
   1468 
   1469         ; per-byte signed >>3: handle low bytes of each word via shift-up/shift-down...
   1470         movq        mm0, mm5              ; get a copy of filters
   1471         psllw       mm0, 8                ; move each low byte into its word's high byte
   1472         psraw       mm0, 3                ; arithmetic shift right 3 (signed >>3 of low bytes)
   1473         psrlw       mm0, 8                ; move results back down to the low bytes
   1474         movq        mm1, mm5              ; get a copy of filters
   1475         psraw       mm1, 11               ; ...and high bytes via >>11 (= high byte >>3, sign kept)
   1476         psllw       mm1, 8                ; shift left 8 to put it back
   1477 
   1478         por         mm0, mm1              ; put the two together to get result
   1479 
   1480         psubsb      mm3, mm0              ; q0 -= (filter + 4) >> 3
   1481         pxor        mm3, [t80 GLOBAL]     ; unoffset
   1482         movq        [rsi], mm3            ; write back
   1483 
   1484 
   1485         ; now do +3 side
   1486         psubsb      mm5, [t1s GLOBAL]      ; +3 instead of +4
   1487 
   1488         movq        mm0, mm5              ; get a copy of filters
   1489         psllw       mm0, 8                ; move each low byte into its word's high byte
   1490         psraw       mm0, 3                ; arithmetic shift right 3 (signed >>3 of low bytes)
   1491         psrlw       mm0, 8                ; move results back down to the low bytes
   1492         psraw       mm5, 11               ; high bytes: >>11 (= high byte >>3, sign kept)
   1493         psllw       mm5, 8                ; shift left 8 to put it back
   1494         por         mm0, mm5              ; put the two together to get result
   1495 
   1496 
   1497         paddsb      mm6, mm0              ; p0 += (filter + 3) >> 3
   1498         pxor        mm6, [t80 GLOBAL]     ; unoffset
   1499         movq        [rsi+rax], mm6        ; write back
   1500 
   1501         add         rsi,8                 ; advance to the next 8 columns
   1502         neg         rax                   ; restore positive stride for the next iteration
   1503         dec         rcx
   1504         jnz         nexts8_h
   1505 
   1506     ; begin epilog
   1507     pop rdi
   1508     pop rsi
   1509     RESTORE_GOT
   1510     UNSHADOW_ARGS
   1511     pop         rbp
   1512     ret
   1512 
   1513 
   1514 ;void vp8_loop_filter_simple_vertical_edge_mmx
   1515 ;(
   1516 ;    unsigned char *src_ptr,
   1517 ;    int  src_pixel_step,
   1518 ;    const char *flimit,
   1519 ;    const char *limit,
   1520 ;    const char *thresh,
   1521 ;    int count
   1522 ;)
   1523 global sym(vp8_loop_filter_simple_vertical_edge_mmx)
   1524 sym(vp8_loop_filter_simple_vertical_edge_mmx):
   1525     push        rbp
   1526     mov         rbp, rsp
   1527     SHADOW_ARGS_TO_STACK 6
   1528     GET_GOT     rbx
   1529     push        rsi
   1530     push        rdi
   1531     ; end prolog
   1532 
   1533     ALIGN_STACK 16, rax
   1534     sub          rsp, 32      ; reserve 32 bytes
   1535     %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
   1536     %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
   1537 
   1538         mov         rsi, arg(0) ;src_ptr
   1539         movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; row stride in bytes
   1540 
   1541         lea         rsi, [rsi + rax*4- 2];  ; rsi = src + 4*stride - 2 (2 columns left of the edge)
   1542         movsxd      rcx, dword ptr arg(5) ;count
   1543 nexts8_v:
   1544 
   1545         lea         rdi,        [rsi + rax];
   1546         movd        mm0,        [rdi + rax * 2]                 ; xx xx xx xx 73 72 71 70
   1547 
   1548         movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 63 62 61 60
   1549         punpcklbw   mm6,        mm0                             ; 73 63 72 62 71 61 70 60
   1550 
   1551         movd        mm0,        [rsi + rax]                     ; xx xx xx xx 53 52 51 50
   1552         movd        mm4,        [rsi]                           ; xx xx xx xx 43 42 41 40
   1553 
   1554         punpcklbw   mm4,        mm0                             ; 53 43 52 42 51 41 50 40
   1555         movq        mm5,        mm4                             ; 53 43 52 42 51 41 50 40
   1556 
   1557         punpcklwd   mm4,        mm6                             ; 71 61 51 41 70 60 50 40
   1558         punpckhwd   mm5,        mm6                             ; 73 63 53 43 72 62 52 42
   1559 
   1560         neg         rax                                         ; negative stride: address rows above
   1561 
   1562         movd        mm7,        [rsi + rax]                     ; xx xx xx xx 33 32 31 30
   1563         movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 23 22 21 20
   1564 
   1565         punpcklbw   mm6,        mm7                             ; 33 23 32 22 31 21 30 20
   1566         movd        mm1,        [rdi + rax * 4]                 ; xx xx xx xx 13 12 11 10
   1567 
   1568         movd        mm0,        [rsi + rax * 4]                 ; xx xx xx xx 03 02 01 00
   1569         punpcklbw   mm0,        mm1                             ; 13 03 12 02 11 01 10 00
   1570 
   1571         movq        mm2,        mm0                             ; 13 03 12 02 11 01 10 00
   1572         punpcklwd   mm0,        mm6                             ; 31 21 11 01 30 20 10 00
   1573 
   1574         punpckhwd   mm2,        mm6                             ; 33 23 13 03 32 22 12 02
   1575         movq        mm1,        mm0                             ; 31 21 11 01 30 20 10 00
   1576 
   1577         punpckldq   mm0,        mm4                             ; 70 60 50 40 30 20 10 00       = p1
   1578         movq        mm3,        mm2                             ; 33 23 13 03 32 22 12 02
   1579 
   1580         punpckhdq   mm1,        mm4                             ; 71 61 51 41 31 21 11 01       = p0
   1581         punpckldq   mm2,        mm5                             ; 72 62 52 42 32 22 12 02       = q0
   1582 
   1583         punpckhdq   mm3,        mm5                             ; 73 63 53 43 33 23 13 03       = q1
   1584 
   1585 
   1586         ; calculate mask: does |p0-q0|*2 + |p1-q1|/2 stay under flimit*2 + limit?
   1587         movq        mm6,        mm0                             ; p1
   1588         movq        mm7,        mm3                             ; q1
   1589         psubusb     mm7,        mm6                             ; q1-=p1
   1590         psubusb     mm6,        mm3                             ; p1-=q1
   1591         por         mm6,        mm7                             ; abs(p1-q1)
   1592         pand        mm6,        [tfe GLOBAL]                    ; set lsb of each byte to zero
   1593         psrlw       mm6,        1                               ; abs(p1-q1)/2 per byte (no cross-byte borrow)
   1594 
   1595         movq        mm5,        mm1                             ; p0
   1596         movq        mm4,        mm2                             ; q0
   1597 
   1598         psubusb     mm5,        mm2                             ; p0-=q0
   1599         psubusb     mm4,        mm1                             ; q0-=p0
   1600 
   1601         por         mm5,        mm4                             ; abs(p0 - q0)
   1602         paddusb     mm5,        mm5                             ; abs(p0-q0)*2
   1603         paddusb     mm5,        mm6                             ; abs (p0 - q0) *2 + abs(p1-q1)/2
   1604 
   1605         mov         rdx,        arg(2) ;flimit                          ; get flimit
   1606         movq        mm7,        [rdx]
   1607         mov         rdx,        arg(3)                          ; get limit
   1608         movq        mm6,        [rdx]
   1609         paddb       mm7,        mm7                             ; flimit*2 (less than 255)
   1610         paddb       mm7,        mm6                             ; flimit * 2 + limit (less than 255)
   1611 
   1612         psubusb     mm5,        mm7                             ; 0 where abs(p0 - q0) *2 + abs(p1-q1)/2 <= flimit * 2 + limit
   1613         pxor        mm7,        mm7                             ; zero for the byte compare
   1614         pcmpeqb     mm5,        mm7                             ; mm5 = mask (FF = filter this pixel)
   1615 
   1616         ; start work on filters; stash unsigned p1/q1 for the write-back transpose
   1617         movq        t0,         mm0
   1618         movq        t1,         mm3
   1619 
   1620         pxor        mm0,        [t80 GLOBAL]                    ; p1 offset to convert to signed values
   1621         pxor        mm3,        [t80 GLOBAL]                    ; q1 offset to convert to signed values
   1622 
   1623         psubsb      mm0,        mm3                             ; p1 - q1
   1624         movq        mm6,        mm1                             ; p0
   1625 
   1626         movq        mm7,        mm2                             ; q0
   1627         pxor        mm6,        [t80 GLOBAL]                    ; offset to convert to signed values
   1628 
   1629         pxor        mm7,        [t80 GLOBAL]                    ; offset to convert to signed values
   1630         movq        mm3,        mm7                             ; signed (offset) copy of q0
   1631 
   1632         psubsb      mm7,        mm6                             ; q0 - p0
   1633         paddsb      mm0,        mm7                             ; p1 - q1 + 1 * (q0 - p0)
   1634 
   1635         paddsb      mm0,        mm7                             ; p1 - q1 + 2 * (q0 - p0)
   1636         paddsb      mm0,        mm7                             ; p1 - q1 + 3 * (q0 - p0)
   1637 
   1638         pand        mm5,        mm0                             ; mask filter values we don't care about
   1639 
   1640         paddsb      mm5,        [t4 GLOBAL]                     ;  3* (q0 - p0) + (p1 - q1) + 4
   1641 
   1642         ; per-byte signed >>3: low bytes via shift-up/shift-down, high bytes via >>11
   1643         movq        mm0,        mm5                             ; get a copy of filters
   1644         psllw       mm0,        8                               ; move each low byte into its word's high byte
   1645         psraw       mm0,        3                               ; arithmetic shift right 3 (signed >>3 of low bytes)
   1646         psrlw       mm0,        8                               ; move results back down to the low bytes
   1647 
   1648         movq        mm7,        mm5                             ; get a copy of filters
   1649         psraw       mm7,        11                              ; arithmetic shift right 11 (= high byte >>3, sign kept)
   1650         psllw       mm7,        8                               ; shift left 8 to put it back
   1651 
   1652         por         mm0,        mm7                             ; put the two together to get result
   1653 
   1654         psubsb      mm3,        mm0                             ; q0 -= (filter + 4) >> 3
   1655         pxor        mm3,        [t80 GLOBAL]                    ; unoffset
   1656 
   1657         ; now do +3 side
   1658         psubsb      mm5, [t1s GLOBAL]                           ; +3 instead of +4
   1659 
   1660         movq        mm0, mm5                                    ; get a copy of filters
   1661         psllw       mm0, 8                                      ; move each low byte into its word's high byte
   1662         psraw       mm0, 3                                      ; arithmetic shift right 3 (signed >>3 of low bytes)
   1663         psrlw       mm0, 8                                      ; move results back down to the low bytes
   1664 
   1665         psraw       mm5, 11                                     ; arithmetic shift right 11 (= high byte >>3, sign kept)
   1666         psllw       mm5, 8                                      ; shift left 8 to put it back
   1667         por         mm0, mm5                                    ; put the two together to get result
   1668 
   1669         paddsb      mm6, mm0                                    ; p0 += (filter + 3) >> 3
   1670         pxor        mm6, [t80 GLOBAL]                           ; unoffset
   1671 
   1672 
   1673         movq        mm0,        t0                              ; reload unfiltered p1
   1674         movq        mm4,        t1                              ; reload unfiltered q1
   1675 
   1676         ; mm0 = 70 60 50 40 30 20 10 00
   1677         ; mm6 = 71 61 51 41 31 21 11 01
   1678         ; mm3 = 72 62 52 42 32 22 12 02
   1679         ; mm4 = 73 63 53 43 33 23 13 03
   1680         ; transpose back to write out
   1681 
   1682         movq        mm1,        mm0                         ; copy of p1 row for the high half
   1683         punpcklbw   mm0,        mm6                         ; 31 30 21 20 11 10 01 00
   1684 
   1685         punpckhbw   mm1,        mm6                         ; 71 70 61 60 51 50 41 40
   1686         movq        mm2,        mm3                         ; copy of q0 row for the low half
   1687 
   1688         punpcklbw   mm2,        mm4                         ; 33 32 23 22 13 12 03 02
   1689         movq        mm5,        mm1                         ; 71 70 61 60 51 50 41 40
   1690 
   1691         punpckhbw   mm3,        mm4                         ; 73 72 63 62 53 52 43 42
   1692         movq        mm6,        mm0                         ; 31 30 21 20 11 10 01 00
   1693 
   1694         punpcklwd   mm0,        mm2                         ; 13 12 11 10 03 02 01 00
   1695         punpckhwd   mm6,        mm2                         ; 33 32 31 30 23 22 21 20
   1696 
   1697         movd        [rsi+rax*4], mm0                        ; write 03 02 01 00
   1698         punpcklwd   mm1,        mm3                         ; 53 52 51 50 43 42 41 40
   1699 
   1700         psrlq       mm0,        32                          ; xx xx xx xx 13 12 11 10
   1701         punpckhwd   mm5,        mm3                         ; 73 72 71 70 63 62 61 60
   1702 
   1703         movd        [rdi+rax*4], mm0                        ; write 13 12 11 10
   1704         movd        [rsi+rax*2], mm6                        ; write 23 22 21 20
   1705 
   1706         psrlq       mm6,        32                          ; 33 32 31 30
   1707         movd        [rsi],      mm1                         ; write 43 42 41 40
   1708 
   1709         movd        [rsi + rax], mm6                        ; write 33 32 31 30
   1710         neg         rax                                     ; back to positive stride
   1711 
   1712         movd        [rsi + rax*2], mm5                      ; write 63 62 61 60
   1713         psrlq       mm1,        32                          ; 53 52 51 50
   1714 
   1715         movd        [rdi],      mm1                         ; write out 53 52 51 50
   1716         psrlq       mm5,        32                          ; 73 72 71 70
   1717 
   1718         movd        [rdi + rax*2], mm5                      ; write 73 72 71 70
   1719 
   1720         lea         rsi,        [rsi+rax*8]                 ; next 8 rows
   1721 
   1722         dec         rcx
   1723         jnz         nexts8_v
   1724 
   1725     add rsp, 32
   1726     pop rsp                    ; restore the pre-ALIGN_STACK stack pointer
   1727     ; begin epilog
   1728     pop rdi
   1729     pop rsi
   1730     RESTORE_GOT
   1731     UNSHADOW_ARGS
   1732     pop         rbp
   1733     ret
   1733 
   1734 
   1735 
   1736 ;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
   1737 ;                  int y_stride,
   1738 ;                  loop_filter_info *lfi)
   1739 ;{
   1740 ;
   1741 ;
   1742 ;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
   1743 ;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
   1744 ;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
   1745 ;}
   1746 
   1747 SECTION_RODATA
   1748 align 16
   1749 tfe:                        ; 0xfe per byte: clear each byte's lsb so psrlw halves bytes without cross-byte borrow
   1750     times 8 db 0xfe
   1751 align 16
   1752 t80:                        ; 0x80 per byte: XOR bias converts unsigned pixels <-> signed and back
   1753     times 8 db 0x80
   1754 align 16
   1755 t1s:                        ; 1 per byte: subtracted to turn the +4 rounding term into +3
   1756     times 8 db 0x01
   1757 align 16
   1758 t3:                         ; 3 per byte; presumably used by filter code outside this excerpt -- confirm
   1759     times 8 db 0x03
   1760 align 16
   1761 t4:                         ; 4 per byte: rounding term added to the filter value
   1762     times 8 db 0x04
   1763 align 16
   1764 ones:                       ; 1 per word; presumably used by the mb filter outside this excerpt -- confirm
   1765     times 4 dw 0x0001
   1766 align 16
   1767 s27:                        ; 27 << 8 per word (high-byte-scaled tap; user not visible in this excerpt)
   1768     times 4 dw 0x1b00
   1769 align 16
   1770 s18:                        ; 18 << 8 per word (high-byte-scaled tap; user not visible in this excerpt)
   1771     times 4 dw 0x1200
   1772 align 16
   1773 s9:                         ; 9 << 8 per word (high-byte-scaled tap; user not visible in this excerpt)
   1774     times 4 dw 0x0900
   1775 align 16
   1776 s63:                        ; 63 per word (rounding constant; user not visible in this excerpt)
   1777     times 4 dw 0x003f
   1778