Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 
     15 ;void vp9_lpf_horizontal_4_mmx
     16 ;(
     17 ;    unsigned char *src_ptr,
     18 ;    int src_pixel_step,
     19 ;    const char *blimit,
     20 ;    const char *limit,
     21 ;    const char *thresh,
     22 ;    int  count
     23 ;)
        ;
        ; MMX loop filter across a horizontal edge, 8 pixels (one movq) per loop
        ; iteration.
        ;
        ;   src_ptr        - points at the q0 row. Rows p3..p0 are read from
        ;                    src_ptr - 4*step .. src_ptr - 1*step and rows q0..q3
        ;                    from src_ptr .. src_ptr + 3*step. Only p1, p0, q0 and
        ;                    q1 are written back.
        ;   src_pixel_step - row stride in bytes (sign-extended from 32 bits).
        ;   blimit, limit, thresh
        ;                  - 8 bytes are loaded from each pointer, one byte per
        ;                    pixel column.
        ;   count          - number of 8-pixel groups. The loop is do-while shaped:
        ;                    the body runs before count is tested, so count must
        ;                    be >= 1.
        ;
        ; Register roles inside the loop:
        ;   rsi = q0 row of the current group     rdi = rsi + step (q1 row)
        ;   rax = step (negated mid-loop, restored before the next iteration)
        ;   rcx = groups remaining                rdx = scratch pointer
        ;
        ; NOTE(review): sym/arg/GLOBAL/SHADOW_ARGS_TO_STACK/GET_GOT/ALIGN_STACK/
        ; RESTORE_GOT/UNSHADOW_ARGS come from the included x86_abi_support.asm and
        ; are not visible here; comments about them below are assumptions.
        ; NOTE(review): no emms is issued in this routine - presumably the caller
        ; is responsible for clearing MMX state; confirm.
     24 global sym(vp9_lpf_horizontal_4_mmx) PRIVATE
     25 sym(vp9_lpf_horizontal_4_mmx):
     26     push        rbp
     27     mov         rbp, rsp
     28     SHADOW_ARGS_TO_STACK 6
     29     GET_GOT     rbx
     30     push        rsi
     31     push        rdi
     32     ; end prolog
     33 
     34     ALIGN_STACK 16, rax
     35     sub         rsp, 32                         ; reserve 32 bytes
     36     %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];  holds abs(q1 - q0)
     37     %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];  holds abs(p1 - p0)
     38 
     39         mov         rsi, arg(0) ;src_ptr
     40         movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; row stride, sign-extended
     41 
     42         movsxd      rcx, dword ptr arg(5) ;count
     43 .next8_h:
     44         mov         rdx, arg(3) ;limit
     45         movq        mm7, [rdx]
     46         mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
     47         add         rdi, rax
     48 
     49         ; calculate breakout conditions
                ; Each abs difference is built as (a -sat b) | (b -sat a); subtracting
                ; limit with saturation leaves a non-zero byte only where the
                ; difference exceeds limit. All excesses are OR-ed into mm1.
     50         movq        mm2, [rdi+2*rax]      ; q3
     51         movq        mm1, [rsi+2*rax]      ; q2
     52         movq        mm6, mm1              ; q2
     53         psubusb     mm1, mm2              ; q2-=q3
     54         psubusb     mm2, mm6              ; q3-=q2
     55         por         mm1, mm2              ; abs(q3-q2)
     56         psubusb     mm1, mm7              ; non-zero where abs(q3-q2) > limit
     57 
     58 
     59         movq        mm4, [rsi+rax]        ; q1
     60         movq        mm3, mm4              ; q1
     61         psubusb     mm4, mm6              ; q1-=q2
     62         psubusb     mm6, mm3              ; q2-=q1
     63         por         mm4, mm6              ; abs(q2-q1)
     64 
     65         psubusb     mm4, mm7
     66         por        mm1, mm4
     67 
     68         movq        mm4, [rsi]            ; q0
     69         movq        mm0, mm4              ; q0 (kept in mm0 until the filter stage)
     70         psubusb     mm4, mm3              ; q0-=q1
     71         psubusb     mm3, mm0              ; q1-=q0
     72         por         mm4, mm3              ; abs(q0-q1)
     73         movq        t0, mm4               ; save to t0
     74         psubusb     mm4, mm7
     75         por        mm1, mm4
     76 
     77 
     78         neg         rax                   ; negate pitch to deal with above border
     79 
     80         movq        mm2, [rsi+4*rax]      ; p3
     81         movq        mm4, [rdi+4*rax]      ; p2
     82         movq        mm5, mm4              ; p2
     83         psubusb     mm4, mm2              ; p2-=p3
     84         psubusb     mm2, mm5              ; p3-=p2
     85         por         mm4, mm2              ; abs(p3 - p2)
     86         psubusb     mm4, mm7
     87         por        mm1, mm4
     88 
     89 
     90         movq        mm4, [rsi+2*rax]      ; p1
     91         movq        mm3, mm4              ; p1
     92         psubusb     mm4, mm5              ; p1-=p2
     93         psubusb     mm5, mm3              ; p2-=p1
     94         por         mm4, mm5              ; abs(p2 - p1)
     95         psubusb     mm4, mm7
     96         por        mm1, mm4
     97 
     98         movq        mm2, mm3              ; p1
     99 
    100         movq        mm4, [rsi+rax]        ; p0
    101         movq        mm5, mm4              ; p0
    102         psubusb     mm4, mm3              ; p0-=p1
    103         psubusb     mm3, mm5              ; p1-=p0
    104         por         mm4, mm3              ; abs(p1 - p0)
    105         movq        t1, mm4               ; save to t1
    106         psubusb     mm4, mm7
    107         por        mm1, mm4
    108 
    109         movq        mm3, [rdi]            ; q1
    110         movq        mm4, mm3              ; q1
    111         psubusb     mm3, mm2              ; q1-=p1
    112         psubusb     mm2, mm4              ; p1-=q1
    113         por         mm2, mm3              ; abs(p1-q1)
    114         pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero
    115         psrlw       mm2, 1                ; abs(p1-q1)/2 (lsb cleared above so the word shift cannot leak a bit into the neighbouring byte)
    116 
    117         movq        mm6, mm5              ; p0 (kept in mm6 until the filter stage)
    118         movq        mm3, [rsi]            ; q0
    119         psubusb     mm5, mm3              ; p0-=q0
    120         psubusb     mm3, mm6              ; q0-=p0
    121         por         mm5, mm3              ; abs(p0 - q0)
    122         paddusb     mm5, mm5              ; abs(p0-q0)*2
    123         paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
    124 
    125         mov         rdx, arg(2) ;blimit           ; get blimit
    126         movq        mm7, [rdx]            ; blimit
    127 
    128         psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
    129         por         mm1,    mm5
    130         pxor        mm5,    mm5
    131         pcmpeqb     mm1,    mm5           ; mask mm1: 0xff where no limit was exceeded
    132 
    133         ; calculate high edge variance
    134         mov         rdx, arg(4) ;thresh           ; get thresh
    135         movq        mm7, [rdx]            ;
    136         movq        mm4, t0               ; get abs (q1 - q0)
    137         psubusb     mm4, mm7
    138         movq        mm3, t1               ; get abs (p1 - p0)
    139         psubusb     mm3, mm7
                ; Combine with OR, not a wrapping byte add: two non-zero excesses
                ; can sum to 0 modulo 256 (e.g. 0x80 + 0x80), which would report
                ; "no high edge variance" for a pixel that exceeds thresh on both
                ; sides. OR is non-zero exactly when either excess is non-zero.
    140         por         mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
    141 
    142         pcmpeqb     mm4,        mm5       ; 0xff where neither exceeds thresh (mm5 == 0)
    143 
    144         pcmpeqb     mm5,        mm5       ; mm5 = all ones
    145         pxor        mm4,        mm5       ; invert: mm4 = 0xff where high edge variance
    146 
    147 
    148         ; start work on filters
    149         movq        mm2, [rsi+2*rax]      ; p1
    150         movq        mm7, [rdi]            ; q1
    151         pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
    152         pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
    153         psubsb      mm2, mm7              ; p1 - q1
    154         pand        mm2, mm4              ; high var mask (hvm)(p1 - q1)
    155         pxor        mm6, [GLOBAL(t80)]    ; p0: offset to convert to signed values
    156         pxor        mm0, [GLOBAL(t80)]    ; q0: offset to convert to signed values
    157         movq        mm3, mm0              ; q0
    158         psubsb      mm0, mm6              ; q0 - p0
    159         paddsb      mm2, mm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
    160         paddsb      mm2, mm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
    161         paddsb      mm2, mm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
    162         pand        mm1, mm2                  ; mask filter values we don't care about
    163         movq        mm2, mm1
    164         paddsb      mm1, [GLOBAL(t4)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 4
    165         paddsb      mm2, [GLOBAL(t3)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 3
    166 
                ; Signed per-byte >>3: unpack each byte into the HIGH half of a
                ; word (low half zero), arithmetic-shift the word by 8 + 3 = 11,
                ; then pack back to bytes.
    167         pxor        mm0, mm0             ;
    168         pxor        mm5, mm5
    169         punpcklbw   mm0, mm2            ;
    170         punpckhbw   mm5, mm2            ;
    171         psraw       mm0, 11             ;
    172         psraw       mm5, 11
    173         packsswb    mm0, mm5
    174         movq        mm2, mm0            ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
    175 
    176         pxor        mm0, mm0              ; 0
    177         movq        mm5, mm1              ; abcdefgh
    178         punpcklbw   mm0, mm1              ; e0f0g0h0
    179         psraw       mm0, 11               ; sign extended shift right by 3
    180         pxor        mm1, mm1              ; 0
    181         punpckhbw   mm1, mm5              ; a0b0c0d0
    182         psraw       mm1, 11               ; sign extended shift right by 3
    183         movq        mm5, mm0              ; save results
    184 
    185         packsswb    mm0, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
    186         paddsw      mm5, [GLOBAL(ones)]
    187         paddsw      mm1, [GLOBAL(ones)]
    188         psraw       mm5, 1                ; partial shifted one more time for 2nd tap
    189         psraw       mm1, 1                ; partial shifted one more time for 2nd tap
    190         packsswb    mm5, mm1              ; (((3* (q0 - p0) + hvm(p1 - q1) + 4) >>3) + 1) >>1
    191         pandn       mm4, mm5              ; high edge variance additive: kept only where hev is NOT set
    192 
    193         paddsb      mm6, mm2              ; p0+= p0 add
    194         pxor        mm6, [GLOBAL(t80)]    ; unoffset
    195         movq        [rsi+rax], mm6        ; write back
    196 
    197         movq        mm6, [rsi+2*rax]      ; p1
    198         pxor        mm6, [GLOBAL(t80)]    ; reoffset
    199         paddsb      mm6, mm4              ; p1+= p1 add
    200         pxor        mm6, [GLOBAL(t80)]    ; unoffset
    201         movq        [rsi+2*rax], mm6      ; write back
    202 
    203         psubsb      mm3, mm0              ; q0-= q0 add
    204         pxor        mm3, [GLOBAL(t80)]    ; unoffset
    205         movq        [rsi], mm3            ; write back
    206 
    207         psubsb      mm7, mm4              ; q1-= q1 add
    208         pxor        mm7, [GLOBAL(t80)]    ; unoffset
    209         movq        [rdi], mm7            ; write back
    210 
    211         add         rsi,8                 ; advance to the next 8 pixels along the edge
    212         neg         rax                   ; restore positive pitch for the next iteration
    213         dec         rcx
    214         jnz         .next8_h
    215 
    216     add rsp, 32                           ; release t0/t1
    217     pop rsp                               ; presumably restores the rsp saved by ALIGN_STACK - confirm
    218     ; begin epilog
    219     pop rdi
    220     pop rsi
    221     RESTORE_GOT
    222     UNSHADOW_ARGS
    223     pop         rbp
    224     ret
    225 
    226 
    227 ;void vp9_lpf_vertical_4_mmx
    228 ;(
    229 ;    unsigned char *src_ptr,
    230 ;    int  src_pixel_step,
    231 ;    const char *blimit,
    232 ;    const char *limit,
    233 ;    const char *thresh,
    234 ;    int count
    235 ;)
        ;
        ; MMX loop filter across a vertical edge, 8 rows per loop iteration.
        ;
        ; Each iteration loads an 8x8 byte block covering columns src_ptr-4 ..
        ; src_ptr+3 of 8 consecutive rows, transposes it so that each value
        ; p3,p2,p1,p0,q0,q1,q2,q3 (block columns 0..7) holds one byte per row,
        ; filters, transposes p1/p0/q0/q1 back and stores 4 bytes per row at
        ; block column 2 (src_ptr-2 .. src_ptr+1).
        ;
        ;   src_ptr        - row 0 of the first group, at the q0 column.
        ;   src_pixel_step - row stride in bytes (sign-extended from 32 bits).
        ;   blimit, limit, thresh
        ;                  - 8 bytes are loaded from each pointer.
        ;   count          - number of 8-row groups. The loop is do-while shaped:
        ;                    the body runs before count is tested, so count must
        ;                    be >= 1.
        ;
        ; In the transpose comments a two-digit pair "RC" means row R, column C of
        ; the 8x8 block, listed from the most significant byte to the least.
        ;
        ; Register roles inside the loop:
        ;   rsi = row 4, column 0 of the current block   rdi = rsi + step (row 5)
        ;   rax = step (negated mid-loop, restored before the final stores)
        ;   rcx = groups remaining                       rdx = scratch pointer
        ;
        ; NOTE(review): sym/arg/GLOBAL/SHADOW_ARGS_TO_STACK/GET_GOT/ALIGN_STACK/
        ; RESTORE_GOT/UNSHADOW_ARGS come from the included x86_abi_support.asm and
        ; are not visible here; comments about them below are assumptions.
        ; NOTE(review): no emms is issued in this routine - presumably the caller
        ; is responsible for clearing MMX state; confirm.
    236 global sym(vp9_lpf_vertical_4_mmx) PRIVATE
    237 sym(vp9_lpf_vertical_4_mmx):
    238     push        rbp
    239     mov         rbp, rsp
    240     SHADOW_ARGS_TO_STACK 6
    241     GET_GOT     rbx
    242     push        rsi
    243     push        rdi
    244     ; end prolog
    245 
    246     ALIGN_STACK 16, rax
    247     sub          rsp, 64      ; reserve 64 bytes
    248     %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];  holds abs(p1 - p0)
    249     %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];  holds abs(q1 - q0)
    250     %define srct [rsp + 32]   ;__declspec(align(16)) char srct[32];  transposed p1, p0, q0, q1 (8 bytes each)
    251 
    252         mov         rsi,        arg(0) ;src_ptr
    253         movsxd      rax,        dword ptr arg(1) ;src_pixel_step     ; row stride, sign-extended
    254 
    255         lea         rsi,        [rsi + rax*4 - 4]   ; rsi = row 4, four bytes left of the edge
    256 
    257         movsxd      rcx,        dword ptr arg(5) ;count
    258 .next8_v:
    259         mov         rdi,        rsi           ; rdi points to row +1 for indirect addressing
    260         add         rdi,        rax
    261 
    262 
    263         ;transpose
    264         movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
    265         movq        mm7,        mm6                         ; 67 66 65 64 63 62 61 60 (copy of row 6; row 7 is interleaved from memory below)
    266 
    267         punpckhbw   mm7,        [rdi+2*rax]                 ; 77 67 76 66 75 65 74 64
    268         punpcklbw   mm6,        [rdi+2*rax]                 ; 73 63 72 62 71 61 70 60
    269 
    270         movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
    271         movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40
    272 
    273         punpckhbw   mm5,        [rsi+rax]                   ; 57 47 56 46 55 45 54 44
    274         punpcklbw   mm4,        [rsi+rax]                   ; 53 43 52 42 51 41 50 40
    275 
    276         movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
    277         punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46
    278 
    279         punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
    280         movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40
    281 
    282         punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
    283         punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40
    284 
    285         neg         rax                                     ; negative stride to reach rows 0..3 above rsi
    286         movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20
    287 
    288         movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
    289         punpckhbw   mm6,        [rsi+rax]                   ; 37 27 36 26 35 25 34 24
    290 
    291         punpcklbw   mm1,        [rsi+rax]                   ; 33 23 32 22 31 21 30 20
    292         movq        mm7,        [rsi+rax*4];                ; 07 06 05 04 03 02 01 00
    293 
    294         punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
    295         movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04
    296 
    297         punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
    298         punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04
    299 
    300         movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
    301         punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3
    302 
    303         punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2
    304 
    305         movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06
    306         psubusb     mm5,        mm7                         ; q2-q3
    307 
    308         psubusb     mm7,        mm6                         ; q3-q2
    309         por         mm7,        mm5;                        ; mm7=abs (q3-q2)
    310 
    311         movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
    312         punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1
    313 
    314         punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 14 04 = q0
    315         movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1
    316 
    317         psubusb     mm3,        mm6                         ; q1-q2
    318         psubusb     mm6,        mm5                         ; q2-q1
    319 
    320         por         mm6,        mm3                         ; mm6=abs(q2-q1)
    321         lea         rdx,        srct
    322 
    323         movq        [rdx+24],   mm5                         ; save q1
    324         movq        [rdx+16],   mm0                         ; save q0
    325 
    326         movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
    327         punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00
    328 
    329         movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
    330         punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00
    331 
    332         punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
    333         movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00
    334 
    335         punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
    336         punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2
    337 
    338         movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
    339         psubusb     mm2,        mm0                         ; p2-p3
    340 
    341         psubusb     mm0,        mm1                         ; p3-p2
    342         por         mm0,        mm2                         ; mm0=abs(p3-p2)
    343 
    344         movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
    345         punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1
    346 
    347         punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
    348         movq        [rdx+8],    mm3                         ; save p0
    349 
    350         movq        [rdx],      mm2                         ; save p1
    351         movq        mm5,        mm2                         ; mm5 = p1
    352 
    353         psubusb     mm2,        mm1                         ; p1-p2
    354         psubusb     mm1,        mm5                         ; p2-p1
    355 
    356         por         mm1,        mm2                         ; mm1=abs(p2-p1)
    357         mov         rdx,        arg(3) ;limit
    358 
                ; Saturating subtraction of limit leaves a non-zero byte only where
                ; the absolute difference exceeds limit.
    359         movq        mm4,        [rdx]                       ; mm4 = limit
    360         psubusb     mm7,        mm4
    361 
    362         psubusb     mm0,        mm4
    363         psubusb     mm1,        mm4
    364 
    365         psubusb     mm6,        mm4
    366         por         mm7,        mm6
    367 
    368         por         mm0,        mm1
    369         por         mm0,        mm7                         ;   abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
    370 
    371         movq        mm1,        mm5                         ; p1
    372 
    373         movq        mm7,        mm3                         ; mm3=mm7=p0
    374         psubusb     mm7,        mm5                         ; p0 - p1
    375 
    376         psubusb     mm5,        mm3                         ; p1 - p0
    377         por         mm5,        mm7                         ; abs(p1-p0)
    378 
    379         movq        t0,         mm5                         ; save abs(p1-p0)
    380         lea         rdx,        srct
    381 
    382         psubusb     mm5,        mm4
    383         por         mm0,        mm5                         ; mm0=mask
    384 
    385         movq        mm5,        [rdx+16]                    ; mm5=q0
    386         movq        mm7,        [rdx+24]                    ; mm7=q1
    387 
    388         movq        mm6,        mm5                         ; mm6=q0
    389         movq        mm2,        mm7                         ; q1
    390         psubusb     mm5,        mm7                         ; q0-q1
    391 
    392         psubusb     mm7,        mm6                         ; q1-q0
    393         por         mm7,        mm5                         ; abs(q1-q0)
    394 
    395         movq        t1,         mm7                         ; save abs(q1-q0)
    396         psubusb     mm7,        mm4
    397 
    398         por         mm0,        mm7                         ; mask
    399 
    400         movq        mm5,        mm2                         ; q1
    401         psubusb     mm5,        mm1                         ; q1-=p1
    402         psubusb     mm1,        mm2                         ; p1-=q1
    403         por         mm5,        mm1                         ; abs(p1-q1)
    404         pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
    405         psrlw       mm5,        1                           ; abs(p1-q1)/2 (lsb cleared above so the word shift cannot leak a bit into the neighbouring byte)
    406 
    407         mov         rdx,        arg(2) ;blimit                      ;
    408 
    409         movq        mm4,        [rdx]                       ;blimit
    410         movq        mm1,        mm3                         ; mm1=mm3=p0
    411 
    412         movq        mm7,        mm6                         ; mm7=mm6=q0
    413         psubusb     mm1,        mm7                         ; p0-q0
    414 
    415         psubusb     mm7,        mm3                         ; q0-p0
    416         por         mm1,        mm7                         ; abs(q0-p0)
    417         paddusb     mm1,        mm1                         ; abs(q0-p0)*2
    418         paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
    419 
    420         psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
    421         por         mm1,        mm0;                        ; mask
    422 
    423         pxor        mm0,        mm0
    424         pcmpeqb     mm1,        mm0                         ; mm1 = 0xff where no limit was exceeded
    425 
    426         ; calculate high edge variance
    427         mov         rdx,        arg(4) ;thresh            ; get thresh
    428         movq        mm7,        [rdx]
    429         ;
    430         movq        mm4,        t0              ; get abs (p1 - p0)
    431         psubusb     mm4,        mm7
    432 
    433         movq        mm3,        t1              ; get abs (q1 - q0)
    434         psubusb     mm3,        mm7
    435 
    436         por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
    437         pcmpeqb     mm4,        mm0             ; 0xff where neither exceeds thresh (mm0 == 0)
    438 
    439         pcmpeqb     mm0,        mm0             ; mm0 = all ones
    440         pxor        mm4,        mm0             ; invert: mm4 = 0xff where high edge variance
    441 
    442 
    443 
    444         ; start work on filters
    445         lea         rdx,        srct
    446 
    447         movq        mm2,        [rdx]           ; p1
    448         movq        mm7,        [rdx+24]        ; q1
    449 
    450         movq        mm6,        [rdx+8]         ; p0
    451         movq        mm0,        [rdx+16]        ; q0
    452 
    453         pxor        mm2,        [GLOBAL(t80)]   ; p1 offset to convert to signed values
    454         pxor        mm7,        [GLOBAL(t80)]   ; q1 offset to convert to signed values
    455 
    456         psubsb      mm2,        mm7             ; p1 - q1
    457         pand        mm2,        mm4             ; high var mask (hvm)(p1 - q1)
    458 
    459         pxor        mm6,        [GLOBAL(t80)]   ; p0: offset to convert to signed values
    460         pxor        mm0,        [GLOBAL(t80)]   ; q0: offset to convert to signed values
    461 
    462         movq        mm3,        mm0             ; q0
    463         psubsb      mm0,        mm6             ; q0 - p0
    464 
    465         paddsb      mm2,        mm0             ; 1 * (q0 - p0) + hvm(p1 - q1)
    466         paddsb      mm2,        mm0             ; 2 * (q0 - p0) + hvm(p1 - q1)
    467 
    468         paddsb      mm2,        mm0             ; 3 * (q0 - p0) + hvm(p1 - q1)
    469         pand       mm1,        mm2              ; mask filter values we don't care about
    470 
    471         movq        mm2,        mm1
    472         paddsb      mm1,        [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
    473 
    474         paddsb      mm2,        [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
                ; Signed per-byte >>3: unpack each byte into the HIGH half of a
                ; word (low half zero), arithmetic-shift the word by 8 + 3 = 11,
                ; then pack back to bytes.
    475         pxor        mm0,        mm0          ;
    476 
    477         pxor        mm5,        mm5
    478         punpcklbw   mm0,        mm2         ;
    479 
    480         punpckhbw   mm5,        mm2         ;
    481         psraw       mm0,        11              ;
    482 
    483         psraw       mm5,        11
    484         packsswb    mm0,        mm5
    485 
    486         movq        mm2,        mm0         ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
    487 
    488         pxor        mm0,        mm0           ; 0
    489         movq        mm5,        mm1           ; abcdefgh
    490 
    491         punpcklbw   mm0,        mm1           ; e0f0g0h0
    492         psraw       mm0,        11                ; sign extended shift right by 3
    493 
    494         pxor        mm1,        mm1           ; 0
    495         punpckhbw   mm1,        mm5           ; a0b0c0d0
    496 
    497         psraw       mm1,        11                ; sign extended shift right by 3
    498         movq        mm5,        mm0              ; save results
    499 
    500         packsswb    mm0,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
    501         paddsw      mm5,        [GLOBAL(ones)]
    502 
    503         paddsw      mm1,        [GLOBAL(ones)]
    504         psraw       mm5,        1                 ; partial shifted one more time for 2nd tap
    505 
    506         psraw       mm1,        1                 ; partial shifted one more time for 2nd tap
    507         packsswb    mm5,        mm1           ; (((3* (q0 - p0) + hvm(p1 - q1) + 4) >>3) + 1) >>1
    508 
    509         pandn       mm4,        mm5             ; high edge variance additive: kept only where hev is NOT set
    510 
    511         paddsb      mm6,        mm2             ; p0+= p0 add
    512         pxor        mm6,        [GLOBAL(t80)]   ; unoffset
    513 
    514         ; mm6=p0                               ;
    515         movq        mm1,        [rdx]           ; p1
    516         pxor        mm1,        [GLOBAL(t80)]   ; reoffset
    517 
    518         paddsb      mm1,        mm4                 ; p1+= p1 add
    519         pxor        mm1,        [GLOBAL(t80)]       ; unoffset
    520         ; mm6 = p0 mm1 = p1
    521 
    522         psubsb      mm3,        mm0                 ; q0-= q0 add
    523         pxor        mm3,        [GLOBAL(t80)]       ; unoffset
    524 
    525         ; mm3 = q0
    526         psubsb      mm7,        mm4                 ; q1-= q1 add
    527         pxor        mm7,        [GLOBAL(t80)]       ; unoffset
    528         ; mm7 = q1
    529 
    530         ; transpose and write back
    531         ; mm1 =    72 62 52 42 32 22 12 02
    532         ; mm6 =    73 63 53 43 33 23 13 03
    533         ; mm3 =    74 64 54 44 34 24 14 04
    534         ; mm7 =    75 65 55 45 35 25 15 05
    535 
    536         movq        mm2,        mm1             ; 72 62 52 42 32 22 12 02
    537         punpcklbw   mm2,        mm6             ; 33 32 23 22 13 12 03 02
    538 
    539         movq        mm4,        mm3             ; 74 64 54 44 34 24 14 04
    540         punpckhbw   mm1,        mm6             ; 73 72 63 62 53 52 43 42
    541 
    542         punpcklbw   mm4,        mm7             ; 35 34 25 24 15 14 05 04
    543         punpckhbw   mm3,        mm7             ; 75 74 65 64 55 54 45 44
    544 
    545         movq        mm6,        mm2             ; 33 32 23 22 13 12 03 02
    546         punpcklwd   mm2,        mm4             ; 15 14 13 12 05 04 03 02
    547 
    548         punpckhwd   mm6,        mm4             ; 35 34 33 32 25 24 23 22
    549         movq        mm5,        mm1             ; 73 72 63 62 53 52 43 42
    550 
    551         punpcklwd   mm1,        mm3             ; 55 54 53 52 45 44 43 42
    552         punpckhwd   mm5,        mm3             ; 75 74 73 72 65 64 63 62
    553 
    554 
    555         ; mm2 = 15 14 13 12 05 04 03 02
    556         ; mm6 = 35 34 33 32 25 24 23 22
    557         ; mm1 = 55 54 53 52 45 44 43 42
    558         ; mm5 = 75 74 73 72 65 64 63 62
    559 
    560 
    561 
                ; Each register holds two rows (low dword = even row, high dword =
                ; odd row). Store 4 bytes per row at block column 2; rax is still
                ; negative for rows 0..3 and is flipped back before rows 6..7.
    562         movd        [rsi+rax*4+2], mm2          ; row 0
    563         psrlq       mm2,        32
    564 
    565         movd        [rdi+rax*4+2], mm2          ; row 1
    566         movd        [rsi+rax*2+2], mm6          ; row 2
    567 
    568         psrlq       mm6,        32
    569         movd        [rsi+rax+2],mm6             ; row 3
    570 
    571         movd        [rsi+2],    mm1             ; row 4
    572         psrlq       mm1,        32
    573 
    574         movd        [rdi+2],    mm1             ; row 5
    575         neg         rax                         ; restore positive stride
    576 
    577         movd        [rdi+rax+2],mm5             ; row 6
    578         psrlq       mm5,        32
    579 
    580         movd        [rdi+rax*2+2], mm5          ; row 7
    581 
    582         lea         rsi,        [rsi+rax*8]     ; advance 8 rows down the edge
    583         dec         rcx
    584         jnz         .next8_v
    585 
    586     add rsp, 64                                 ; release t0/t1/srct
    587     pop rsp                                     ; presumably restores the rsp saved by ALIGN_STACK - confirm
    588     ; begin epilog
    589     pop rdi
    590     pop rsi
    591     RESTORE_GOT
    592     UNSHADOW_ARGS
    593     pop         rbp
    594     ret
    595 
        ; Read-only constants, each aligned to 16 bytes. The filter code above
        ; reaches them through GLOBAL(name) and only ever loads 8 bytes (one MMX
        ; register) from each.
        ; NOTE(review): SECTION_RODATA is presumably a macro from the included
        ; x86_abi_support.asm that selects the read-only data section - confirm.
    596 SECTION_RODATA
    597 align 16
    598 tfe:                        ; byte mask 0xfe: clears bit 0 of every byte before the word-wide psrlw by 1
    599     times 8 db 0xfe
    600 align 16
    601 t80:                        ; byte 0x80: XOR-ed in to flip pixels between unsigned and signed range
    602     times 8 db 0x80
    603 align 16
    604 t1s:                        ; byte 0x01 - not referenced by the code visible in this file
    605     times 8 db 0x01
    606 align 16
    607 t3:                         ; byte 3: rounding term added to the filter value before the >>3 that adjusts p0
    608     times 8 db 0x03
    609 align 16
    610 t4:                         ; byte 4: rounding term added to the filter value before the >>3 that adjusts q0
    611     times 8 db 0x04
    612 align 16
    613 ones:                       ; word 1: added before the extra >>1 that forms the p1/q1 adjustment
    614     times 4 dw 0x0001
    615 align 16
    616 s27:                        ; word 0x1b00 - not referenced by the code visible in this file
    617     times 4 dw 0x1b00
    618 align 16
    619 s18:                        ; word 0x1200 - not referenced by the code visible in this file
    620     times 4 dw 0x1200
    621 align 16
    622 s9:                         ; word 0x0900 - not referenced by the code visible in this file
    623     times 4 dw 0x0900
    624 align 16
    625 s63:                        ; word 0x003f - not referenced by the code visible in this file
    626     times 4 dw 0x003f
    627