Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 %include "third_party/x86inc/x86inc.asm"
     12 
     13 SECTION_RODATA
     14 pw_64:    times 8 dw 64
     15 
     16 ; %define USE_PMULHRSW
     17 ; NOTE: pmulhrsw has a latency of 5 cycles.  Tests showed a performance loss
     18 ; when using this instruction.
     19 ;
     20 ; The add order below (based on ffvp9) must be followed to keep the sums in range.
     21 ; x = k0k1 + k4k5
     22 ; y = k2k3 + k6k7
     23 ; z = signed SAT(x + y)
     24 
     25 SECTION .text
     26 %define LOCAL_VARS_SIZE 16*6
        ; LOCAL_VARS_SIZE (above): six 16-byte stack slots addressed off rsp.
        ;   slots 0-3: k0k1, k2k3, k4k5, k6k7
        ;   slot  4:   tmp0 on x86-64, krd otherwise
        ;   slot  5:   tmp1 on x86-64 (no name is bound to it otherwise)
     27 
        ; SETUP_LOCAL_VARS
        ;   Common setup for every filter macro in this file that requests
        ;   LOCAL_VARS_SIZE bytes of stack.
        ;   In:   m4 = the 16 bytes loaded from [filterq]; packsswb below treats
        ;         them as eight signed 16-bit taps k0..k7.
        ;   Out:  k0k1, k2k3, k4k5, k6k7 = stack slots, each holding one pair of
        ;         byte-sized taps repeated in all eight word lanes (the layout
        ;         pmaddubsw takes as its second operand).
        ;         krd = eight words of 64, the rounding term the filters add
        ;         before their `psraw ..., 7` (m12 on x86-64, a stack slot
        ;         otherwise).
        ;         tmp0/tmp1 are additionally defined on x86-64.
        ;   Clobbers: m0-m4, plus m12 (x86-64) or m6 (otherwise).
     28 %macro SETUP_LOCAL_VARS 0
     29     ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
     30     ; pmaddubsw has a higher latency on some platforms, this might be eased by
     31     ; interleaving the instructions.
     32     %define    k0k1  [rsp + 16*0]
     33     %define    k2k3  [rsp + 16*1]
     34     %define    k4k5  [rsp + 16*2]
     35     %define    k6k7  [rsp + 16*3]
            ; Saturate the eight word taps to signed bytes; k0..k7 now occupy the
            ; low 8 bytes, i.e. words 0..3 are the pairs k0k1, k2k3, k4k5, k6k7.
     36     packsswb     m4, m4
     37     ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
     38     ; some platforms.
            ; Broadcast one tap pair across the low four words of each register...
     39     pshuflw      m0, m4, 0b              ;k0_k1
     40     pshuflw      m1, m4, 01010101b       ;k2_k3
     41     pshuflw      m2, m4, 10101010b       ;k4_k5
     42     pshuflw      m3, m4, 11111111b       ;k6_k7
            ; ...then copy the low qword into the high qword so it fills all eight.
     43     punpcklqdq   m0, m0
     44     punpcklqdq   m1, m1
     45     punpcklqdq   m2, m2
     46     punpcklqdq   m3, m3
     47     mova       k0k1, m0
     48     mova       k2k3, m1
     49     mova       k4k5, m2
     50     mova       k6k7, m3
     51 %if ARCH_X86_64
            ; Enough xmm registers here to keep the rounding constant resident.
     52     %define     krd  m12
     53     %define    tmp0  [rsp + 16*4]
     54     %define    tmp1  [rsp + 16*5]
     55     mova        krd, [GLOBAL(pw_64)]
     56 %else
     57     %define     krd  [rsp + 16*4]
     58 %if CONFIG_PIC=0
     59     mova         m6, [GLOBAL(pw_64)]
     60 %else
     61     ; build constants without accessing global memory
     62     pcmpeqb      m6, m6                  ;all ones
     63     psrlw        m6, 15
     64     psllw        m6, 6                   ;aka pw_64
     65 %endif
     66     mova        krd, m6
     67 %endif
     68 %endm
     69 
     70 ;-------------------------------------------------------------------------------
        ; Stack bytes requested by SUBPIX_HFILTER4: none on x86-64, where its
        ; constants live in m8-m10; 16*4 otherwise, of which the macro addresses
        ; three 16-byte slots (k0k1k4k5, k2k3k6k7, krd).
     71 %if ARCH_X86_64
     72   %define LOCAL_VARS_SIZE_H4 0
     73 %else
     74   %define LOCAL_VARS_SIZE_H4 16*4
     75 %endif
     76 
        ; SUBPIX_HFILTER4 %1
        ;   Emits filter_block1d4_%1: 8-tap horizontal filter, 4 output pixels
        ;   per row.  %1 is the name suffix; when it is h8_avg the filtered bytes
        ;   are averaged (pavgb) with the bytes already stored at dst.
        ;   Args (cglobal order): src, sstride, dst, dstride, height, filter.
        ;   Each row is read with one unaligned 16-byte load starting at src - 3.
        ;   Per pixel: the four tap-pair products are summed with saturating
        ;   adds as (k0k1 + k4k5) + (k2k3 + k6k7), 64 is added, the word is
        ;   shifted right arithmetically by 7 and packed with unsigned saturation.
        ;   Two rows are produced per loop iteration; one trailing row is
        ;   handled after the loop when height is odd.
        ;   NOTE(review): the loop body runs before height is first tested, so
        ;   two rows are always written -- presumably callers pass height >= 2;
        ;   confirm.
     77 %macro SUBPIX_HFILTER4 1
     78 cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
     79                             src, sstride, dst, dstride, height, filter
     80     mova                m4, [filterq]
            ; Word taps -> signed bytes; both qwords of m4 now hold k0..k7, so the
            ; pshuflw/pshufhw pairs below can pick a different tap pair for the
            ; low and the high four words of one register.
     81     packsswb            m4, m4
     82 %if ARCH_X86_64
     83     %define       k0k1k4k5  m8
     84     %define       k2k3k6k7  m9
     85     %define            krd  m10
     86     mova               krd, [GLOBAL(pw_64)]
     87     pshuflw       k0k1k4k5, m4, 0b              ;k0_k1
     88     pshufhw       k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
     89     pshuflw       k2k3k6k7, m4, 01010101b       ;k2_k3
     90     pshufhw       k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
     91 %else
     92     %define       k0k1k4k5  [rsp + 16*0]
     93     %define       k2k3k6k7  [rsp + 16*1]
     94     %define            krd  [rsp + 16*2]
     95     pshuflw             m6, m4, 0b              ;k0_k1
     96     pshufhw             m6, m6, 10101010b       ;k0_k1_k4_k5
     97     pshuflw             m7, m4, 01010101b       ;k2_k3
     98     pshufhw             m7, m7, 11111111b       ;k2_k3_k6_k7
     99 %if CONFIG_PIC=0
    100     mova                m1, [GLOBAL(pw_64)]
    101 %else
    102     ; build constants without accessing global memory
    103     pcmpeqb             m1, m1                  ;all ones
    104     psrlw               m1, 15
    105     psllw               m1, 6                   ;aka pw_64
    106 %endif
    107     mova          k0k1k4k5, m6
    108     mova          k2k3k6k7, m7
    109     mova               krd, m1
    110 %endif
            ; Bias height by -1: after the `sub heightd, 2` at the loop bottom,
            ; "greater" means at least two rows remain, "zero" means exactly one
            ; row remains, "negative" means none.
    111     dec            heightd
    112 
    113 .loop:
    114     ;Do two rows at once
    115     movu                m4, [srcq - 3]
    116     movu                m5, [srcq + sstrideq - 3]
            ; Duplicate every source byte so that palignr by an odd amount yields
            ; overlapping (p[i], p[i+1]) byte pairs, one pair per word.
    117     punpckhbw           m1, m4, m4
    118     punpcklbw           m4, m4
    119     punpckhbw           m3, m5, m5
    120     punpcklbw           m5, m5
            ; Shift 1 -> pairs starting at p[i];   low words * k0k1, high * k4k5.
            ; Shift 5 -> pairs starting at p[i+2]; low words * k2k3, high * k6k7.
    121     palignr             m0, m1, m4, 1
    122     pmaddubsw           m0, k0k1k4k5
    123     palignr             m1, m4, 5
    124     pmaddubsw           m1, k2k3k6k7
    125     palignr             m2, m3, m5, 1
    126     pmaddubsw           m2, k0k1k4k5
    127     palignr             m3, m5, 5
    128     pmaddubsw           m3, k2k3k6k7
            ; Regroup so each register holds the same partial sum for both rows
            ; (first row in the low qword, second row in the high qword).
    129     punpckhqdq          m4, m0, m2
    130     punpcklqdq          m0, m2
    131     punpckhqdq          m5, m1, m3
    132     punpcklqdq          m1, m3
    133     paddsw              m0, m4
    134     paddsw              m1, m5
    135 %ifidn %1, h8_avg
    136     movd                m4, [dstq]
    137     movd                m5, [dstq + dstrideq]
    138 %endif
    139     paddsw              m0, m1
    140     paddsw              m0, krd
    141     psraw               m0, 7
    142     packuswb            m0, m0
            ; Bytes 0-3 are the first row, bytes 4-7 the second row.
    143     psrldq              m1, m0, 4
    144 
    145 %ifidn %1, h8_avg
    146     pavgb               m0, m4
    147     pavgb               m1, m5
    148 %endif
    149     movd            [dstq], m0
    150     movd [dstq + dstrideq], m1
    151 
    152     lea               srcq, [srcq + sstrideq        ]
    153     prefetcht0              [srcq + 4 * sstrideq - 3]
    154     lea               srcq, [srcq + sstrideq        ]
    155     lea               dstq, [dstq + 2 * dstrideq    ]
    156     prefetcht0              [srcq + 2 * sstrideq - 3]
    157 
    158     sub            heightd, 2
    159     jg               .loop
    160 
    161     ; Do last row if output_height is odd
            ; (flags are still those of the `sub` above: non-zero => nothing left)
    162     jne              .done
    163 
    164     movu                m4, [srcq - 3]
    165     punpckhbw           m1, m4, m4
    166     punpcklbw           m4, m4
    167     palignr             m0, m1, m4, 1
    168     palignr             m1, m4, 5
    169     pmaddubsw           m0, k0k1k4k5
    170     pmaddubsw           m1, k2k3k6k7
            ; Bring the high-qword sums (k4k5 / k6k7) down next to the low-qword
            ; sums (k0k1 / k2k3) so they can be added lane by lane.
    171     psrldq              m2, m0, 8
    172     psrldq              m3, m1, 8
    173     paddsw              m0, m2
    174     paddsw              m1, m3
    175     paddsw              m0, m1
    176     paddsw              m0, krd
    177     psraw               m0, 7
    178     packuswb            m0, m0
    179 %ifidn %1, h8_avg
    180     movd                m4, [dstq]
    181     pavgb               m0, m4
    182 %endif
    183     movd            [dstq], m0
    184 .done:
    185     REP_RET
    186 %endm
    187 
    188 ;-------------------------------------------------------------------------------
        ; SUBPIX_HFILTER8 %1
        ;   Emits filter_block1d8_%1: 8-tap horizontal filter, 8 output pixels
        ;   per row.  %1 is the name suffix; when it is h8_avg the filtered bytes
        ;   are averaged (pavgb) with the bytes already stored at dst.
        ;   Args (cglobal order): src, sstride, dst, dstride, height, filter.
        ;   Tap constants and krd come from SETUP_LOCAL_VARS (m4 is loaded from
        ;   [filterq] immediately before it).
        ;   Each row is read with one unaligned 16-byte load starting at src - 3.
        ;   Two rows are produced per loop iteration; one trailing row is
        ;   handled after the loop when height is odd.
        ;   NOTE(review): the loop body runs before height is first tested, so
        ;   two rows are always written -- presumably callers pass height >= 2;
        ;   confirm.
    189 %macro SUBPIX_HFILTER8 1
    190 cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
    191                             src, sstride, dst, dstride, height, filter
    192     mova                 m4, [filterq]
    193     SETUP_LOCAL_VARS
            ; Bias height by -1 so that, after `sub heightd, 2`, a zero result
            ; means exactly one row is left.
    194     dec             heightd
    195 
    196 .loop:
    197     ;Do two rows at once
    198     movu                 m0, [srcq - 3]
    199     movu                 m4, [srcq + sstrideq - 3]
            ; First row: duplicate each byte, then palignr by 1/5/9/13 yields the
            ; (p[i],p[i+1]), (p[i+2],p[i+3]), (p[i+4],p[i+5]), (p[i+6],p[i+7])
            ; byte pairs that are multiplied by k0k1/k2k3/k4k5/k6k7.
    200     punpckhbw            m1, m0, m0
    201     punpcklbw            m0, m0
    202     palignr              m5, m1, m0, 13
    203     pmaddubsw            m5, k6k7
    204     palignr              m2, m1, m0, 5
    205     palignr              m3, m1, m0, 9
    206     palignr              m1, m0, 1
    207     pmaddubsw            m1, k0k1
            ; Second row setup is interleaved with the first row's arithmetic.
    208     punpckhbw            m6, m4, m4
    209     punpcklbw            m4, m4
    210     pmaddubsw            m2, k2k3
    211     pmaddubsw            m3, k4k5
    212 
    213     palignr              m7, m6, m4, 13
    214     palignr              m0, m6, m4, 5
    215     pmaddubsw            m7, k6k7
            ; First row: (k0k1 + k4k5) + (k2k3 + k6k7), saturating at each step.
    216     paddsw               m1, m3
    217     paddsw               m2, m5
    218     paddsw               m1, m2
    219 %ifidn %1, h8_avg
            ; Gather both existing dst rows into one register for a single pavgb.
    220     movh                 m2, [dstq]
    221     movhps               m2, [dstq + dstrideq]
    222 %endif
    223     palignr              m5, m6, m4, 9
    224     palignr              m6, m4, 1
    225     pmaddubsw            m0, k2k3
    226     pmaddubsw            m6, k0k1
    227     paddsw               m1, krd
    228     pmaddubsw            m5, k4k5
    229     psraw                m1, 7
            ; Second row: same add order, result accumulates in m6.
    230     paddsw               m0, m7
    231     paddsw               m6, m5
    232     paddsw               m6, m0
    233     paddsw               m6, krd
    234     psraw                m6, 7
            ; Low 8 bytes = first row, high 8 bytes = second row.
    235     packuswb             m1, m6
    236 %ifidn %1, h8_avg
    237     pavgb                m1, m2
    238 %endif
    239     movh              [dstq], m1
    240     movhps [dstq + dstrideq], m1
    241 
    242     lea                srcq, [srcq + sstrideq        ]
    243     prefetcht0               [srcq + 4 * sstrideq - 3]
    244     lea                srcq, [srcq + sstrideq        ]
    245     lea                dstq, [dstq + 2 * dstrideq    ]
    246     prefetcht0               [srcq + 2 * sstrideq - 3]
    247     sub             heightd, 2
    248     jg                .loop
    249 
    250     ; Do last row if output_height is odd
            ; (flags are still those of the `sub` above: non-zero => nothing left)
    251     jne               .done
    252 
    253     movu                 m0, [srcq - 3]
    254     punpckhbw            m3, m0, m0
    255     punpcklbw            m0, m0
    256     palignr              m1, m3, m0, 1
    257     palignr              m2, m3, m0, 5
    258     palignr              m4, m3, m0, 13
    259     palignr              m3, m0, 9
    260     pmaddubsw            m1, k0k1
    261     pmaddubsw            m2, k2k3
    262     pmaddubsw            m3, k4k5
    263     pmaddubsw            m4, k6k7
    264     paddsw               m1, m3
    265     paddsw               m4, m2
    266     paddsw               m1, m4
    267     paddsw               m1, krd
    268     psraw                m1, 7
    269     packuswb             m1, m1
    270 %ifidn %1, h8_avg
    271     movh                 m0, [dstq]
    272     pavgb                m1, m0
    273 %endif
    274     movh             [dstq], m1
    275 .done:
    276     REP_RET
    277 %endm
    278 
    279 ;-------------------------------------------------------------------------------
        ; SUBPIX_HFILTER16 %1
        ;   Emits filter_block1d16_%1: 8-tap horizontal filter, 16 output pixels
        ;   per row, one row per loop iteration (dec/jnz, so no odd-row tail).
        ;   %1 is the name suffix; when it is h8_avg the filtered bytes are
        ;   averaged (pavgb) with the bytes already stored at dst.
        ;   Args (cglobal order): src, sstride, dst, dstride, height, filter.
        ;   Tap constants and krd come from SETUP_LOCAL_VARS.
        ;   Each row is read with eight unaligned 16-byte loads at src - 3 ..
        ;   src + 4.  dst is accessed with mova and as a pavgb memory operand;
        ;   presumably dst rows are 16-byte aligned -- confirm against callers.
    280 %macro SUBPIX_HFILTER16 1
    281 cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
    282                              src, sstride, dst, dstride, height, filter
    283     mova          m4, [filterq]
    284     SETUP_LOCAL_VARS
    285 
    286 .loop:
    287     prefetcht0        [srcq + 2 * sstrideq -3]
    288 
            ; pmaddubsw consumes two adjacent bytes per word, so a load at offset
            ; -3 + 2t multiplied by the t-th tap pair contributes to the even
            ; output pixels (m0-m3), while the load one byte later contributes to
            ; the odd output pixels (m4-m7).
    289     movu          m0, [srcq - 3]
    290     movu          m4, [srcq - 2]
    291     pmaddubsw     m0, k0k1
    292     pmaddubsw     m4, k0k1
    293     movu          m1, [srcq - 1]
    294     movu          m5, [srcq + 0]
    295     pmaddubsw     m1, k2k3
    296     pmaddubsw     m5, k2k3
    297     movu          m2, [srcq + 1]
    298     movu          m6, [srcq + 2]
    299     pmaddubsw     m2, k4k5
    300     pmaddubsw     m6, k4k5
    301     movu          m3, [srcq + 3]
    302     movu          m7, [srcq + 4]
    303     pmaddubsw     m3, k6k7
    304     pmaddubsw     m7, k6k7
            ; (k0k1 + k4k5) + (k2k3 + k6k7) for the even, then the odd pixels.
    305     paddsw        m0, m2
    306     paddsw        m1, m3
    307     paddsw        m0, m1
    308     paddsw        m4, m6
    309     paddsw        m5, m7
    310     paddsw        m4, m5
    311     paddsw        m0, krd
    312     paddsw        m4, krd
    313     psraw         m0, 7
    314     psraw         m4, 7
    315     packuswb      m0, m0
    316     packuswb      m4, m4
            ; Interleave even and odd results back into pixel order.
    317     punpcklbw     m0, m4
    318 %ifidn %1, h8_avg
    319     pavgb         m0, [dstq]
    320 %endif
    321     lea         srcq, [srcq + sstrideq]
    322     mova      [dstq], m0
    323     lea         dstq, [dstq + dstrideq]
    324     dec      heightd
    325     jnz        .loop
    326     REP_RET
    327 %endm
    328 
    329 INIT_XMM ssse3
        ; Expand each horizontal macro twice: plain (h8) and averaging (h8_avg).
        ; The trailing comments give the resulting symbol names.
    330 SUBPIX_HFILTER16 h8      ; vpx_filter_block1d16_h8_ssse3
    331 SUBPIX_HFILTER16 h8_avg  ; vpx_filter_block1d16_h8_avg_ssse3
    332 SUBPIX_HFILTER8  h8      ; vpx_filter_block1d8_h8_ssse3
    333 SUBPIX_HFILTER8  h8_avg  ; vpx_filter_block1d8_h8_avg_ssse3
    334 SUBPIX_HFILTER4  h8      ; vpx_filter_block1d4_h8_ssse3
    335 SUBPIX_HFILTER4  h8_avg  ; vpx_filter_block1d4_h8_avg_ssse3
    336 
    337 ;-------------------------------------------------------------------------------
    338 
    339 ; TODO(Linfeng): Detect cpu type and choose the code with better performance.
        ; With this set to 1, every `%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON`
        ; below is true, so the `%else` (register-resident x86-64) variants of the
        ; vertical filters are not assembled.
    340 %define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1
    341 
        ; General-purpose register count passed to cglobal by the vertical
        ; filters: the reload variant on x86-64 uses r7 and r8 on top of the six
        ; argument registers.
    342 %if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
    343     %define NUM_GENERAL_REG_USED 9
    344 %else
    345     %define NUM_GENERAL_REG_USED 6
    346 %endif
    347 
        ; SUBPIX_VFILTER %1, %2
        ;   Emits filter_block1d%2_%1: 8-tap vertical filter, %2 pixels per row.
        ;   %1 is the name suffix; when it is v8_avg the filtered bytes are
        ;   averaged (pavgb) with the bytes already stored at dst.
        ;   %2 selects the row access width: movx is movh when %2 is 8 and movd
        ;   otherwise (instantiated below with 8 and 4).
        ;   Args (cglobal order): src, sstride, dst, dstride, height, filter.
        ;   Tap constants and krd come from SETUP_LOCAL_VARS.
        ;   Source rows are labelled A..H in the comments: A is the row at srcq
        ;   and each following letter is one sstride further; an output row is
        ;   A*k0 + B*k1 + ... + H*k7, summed as (k0k1 + k4k5) + (k2k3 + k6k7),
        ;   plus 64, shifted right by 7, packed with unsigned saturation.
        ;   Two rows are produced per loop iteration; one trailing row is
        ;   handled after the loop when height is odd.
        ;   NOTE(review): the loop body runs before height is first tested, so
        ;   two rows are always written -- presumably callers pass height >= 2;
        ;   confirm.
        ;   Two variants are selected at assembly time:
        ;     - reload variant: re-reads all needed source rows each iteration
        ;       through srcq, src1q (= srcq + sstride) and sstride6q (= 6 * sstride);
        ;     - `%else` variant: keeps interleaved row pairs in m0-m7 across
        ;       iterations, loads two new rows per iteration, uses m8-m14.
    348 %macro SUBPIX_VFILTER 2
    349 cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
    350                              src, sstride, dst, dstride, height, filter
    351     mova          m4, [filterq]
    352     SETUP_LOCAL_VARS
    353 
    354 %ifidn %2, 8
    355     %define                movx  movh
    356 %else
    357     %define                movx  movd
    358 %endif
    359 
            ; Bias height by -1 so that, after `sub heightd, 2`, a zero result
            ; means exactly one row is left.
    360     dec                 heightd
    361 
    362 %if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
    363 
    364 %if ARCH_X86_64
    365     %define               src1q  r7
    366     %define           sstride6q  r8
    367     %define          dst_stride  dstrideq
    368 %else
            ; No spare registers here: filterq is reused as the second row pointer
            ; (the taps were already copied to the stack by SETUP_LOCAL_VARS) and
            ; dstrideq is reused for 6 * sstride, so the dst stride is taken from
            ; dstridemp instead (presumably x86inc's in-memory form of the
            ; argument -- confirm).
    369     %define               src1q  filterq
    370     %define           sstride6q  dstrideq
    371     %define          dst_stride  dstridemp
    372 %endif
    373     mov                   src1q, srcq
    374     add                   src1q, sstrideq
    375     lea               sstride6q, [sstrideq + sstrideq * 4]
    376     add               sstride6q, sstrideq                   ;pitch * 6
    377 
    378 .loop:
    379     ;Do two rows at once
            ; m0/m2/m4/m6 build the first output row from the pairs AB, CD, EF, GH;
            ; m1/m3/m5/m7 build the second output row from the pairs shifted down
            ; by one source row (the "next iter" comments).
    380     movx                     m0, [srcq                ]     ;A
    381     movx                     m1, [src1q               ]     ;B
    382     punpcklbw                m0, m1                         ;A B
    383     movx                     m2, [srcq + sstrideq * 2 ]     ;C
    384     pmaddubsw                m0, k0k1
    385     mova                     m6, m2
    386     movx                     m3, [src1q + sstrideq * 2]     ;D
    387     punpcklbw                m2, m3                         ;C D
    388     pmaddubsw                m2, k2k3
    389     movx                     m4, [srcq + sstrideq * 4 ]     ;E
    390     mova                     m7, m4
    391     movx                     m5, [src1q + sstrideq * 4]     ;F
    392     punpcklbw                m4, m5                         ;E F
    393     pmaddubsw                m4, k4k5
    394     punpcklbw                m1, m6                         ;A B next iter
    395     movx                     m6, [srcq + sstride6q    ]     ;G
    396     punpcklbw                m5, m6                         ;E F next iter
    397     punpcklbw                m3, m7                         ;C D next iter
    398     pmaddubsw                m5, k4k5
    399     movx                     m7, [src1q + sstride6q   ]     ;H
    400     punpcklbw                m6, m7                         ;G H
    401     pmaddubsw                m6, k6k7
    402     pmaddubsw                m3, k2k3
    403     pmaddubsw                m1, k0k1
    404     paddsw                   m0, m4
    405     paddsw                   m2, m6
            ; The row eight strides below srcq is the last tap row of the second
            ; output row.
    406     movx                     m6, [srcq + sstrideq * 8 ]     ;H next iter
    407     punpcklbw                m7, m6
    408     pmaddubsw                m7, k6k7
    409     paddsw                   m0, m2
    410     paddsw                   m0, krd
    411     psraw                    m0, 7
    412     paddsw                   m1, m5
    413     packuswb                 m0, m0
    414 
    415     paddsw                   m3, m7
    416     paddsw                   m1, m3
    417     paddsw                   m1, krd
    418     psraw                    m1, 7
    419     lea                    srcq, [srcq + sstrideq * 2 ]
    420     lea                   src1q, [src1q + sstrideq * 2]
    421     packuswb                 m1, m1
    422 
    423 %ifidn %1, v8_avg
    424     movx                     m2, [dstq]
    425     pavgb                    m0, m2
    426 %endif
    427     movx                 [dstq], m0
    428     add                    dstq, dst_stride
    429 %ifidn %1, v8_avg
    430     movx                     m3, [dstq]
    431     pavgb                    m1, m3
    432 %endif
    433     movx                 [dstq], m1
    434     add                    dstq, dst_stride
    435     sub                 heightd, 2
    436     jg                    .loop
    437 
    438     ; Do last row if output_height is odd
            ; (flags are still those of the `sub` above: non-zero => nothing left)
    439     jne                   .done
    440 
    441     movx                     m0, [srcq                ]     ;A
    442     movx                     m1, [srcq + sstrideq     ]     ;B
    443     movx                     m6, [srcq + sstride6q    ]     ;G
    444     punpcklbw                m0, m1                         ;A B
    445     movx                     m7, [src1q + sstride6q   ]     ;H
    446     pmaddubsw                m0, k0k1
    447     movx                     m2, [srcq + sstrideq * 2 ]     ;C
    448     punpcklbw                m6, m7                         ;G H
    449     movx                     m3, [src1q + sstrideq * 2]     ;D
    450     pmaddubsw                m6, k6k7
    451     movx                     m4, [srcq + sstrideq * 4 ]     ;E
    452     punpcklbw                m2, m3                         ;C D
    453     movx                     m5, [src1q + sstrideq * 4]     ;F
    454     punpcklbw                m4, m5                         ;E F
    455     pmaddubsw                m2, k2k3
    456     pmaddubsw                m4, k4k5
    457     paddsw                   m2, m6
    458     paddsw                   m0, m4
    459     paddsw                   m0, m2
    460     paddsw                   m0, krd
    461     psraw                    m0, 7
    462     packuswb                 m0, m0
    463 %ifidn %1, v8_avg
    464     movx                     m1, [dstq]
    465     pavgb                    m0, m1
    466 %endif
    467     movx                 [dstq], m0
    468 
    469 %else
    470     ; ARCH_X86_64
    471 
            ; Preload rows A..G once and interleave them into the pair registers:
            ; m0/m2/m4 = AB/CD/EF for the first output row, m1/m3/m5 = the same
            ; pairs shifted down one row for the second output row, m6 = G.
    472     movx                     m0, [srcq                ]     ;A
    473     movx                     m1, [srcq + sstrideq     ]     ;B
    474     lea                    srcq, [srcq + sstrideq * 2 ]
    475     movx                     m2, [srcq]                     ;C
    476     movx                     m3, [srcq + sstrideq]          ;D
    477     lea                    srcq, [srcq + sstrideq * 2 ]
    478     movx                     m4, [srcq]                     ;E
    479     movx                     m5, [srcq + sstrideq]          ;F
    480     lea                    srcq, [srcq + sstrideq * 2 ]
    481     movx                     m6, [srcq]                     ;G
    482     punpcklbw                m0, m1                         ;A B
    483     punpcklbw                m1, m2                         ;A B next iter
    484     punpcklbw                m2, m3                         ;C D
    485     punpcklbw                m3, m4                         ;C D next iter
    486     punpcklbw                m4, m5                         ;E F
    487     punpcklbw                m5, m6                         ;E F next iter
    488 
    489 .loop:
    490     ;Do two rows at once
    491     movx                     m7, [srcq + sstrideq]          ;H
    492     lea                    srcq, [srcq + sstrideq * 2 ]
    493     movx                    m14, [srcq]                     ;H next iter
    494     punpcklbw                m6, m7                         ;G H
    495     punpcklbw                m7, m14                        ;G H next iter
            ; Multiply into m8-m11 and, in between, slide every pair register up
            ; by one slot (CD -> AB, EF -> CD, GH -> EF) for the next iteration.
    496     pmaddubsw                m8, m0, k0k1
    497     pmaddubsw                m9, m1, k0k1
    498     mova                     m0, m2
    499     mova                     m1, m3
    500     pmaddubsw               m10, m2, k2k3
    501     pmaddubsw               m11, m3, k2k3
    502     mova                     m2, m4
    503     mova                     m3, m5
    504     pmaddubsw                m4, k4k5
    505     pmaddubsw                m5, k4k5
    506     paddsw                   m8, m4
    507     paddsw                   m9, m5
    508     mova                     m4, m6
    509     mova                     m5, m7
    510     pmaddubsw                m6, k6k7
    511     pmaddubsw                m7, k6k7
    512     paddsw                  m10, m6
    513     paddsw                  m11, m7
    514     paddsw                   m8, m10
    515     paddsw                   m9, m11
            ; The most recently loaded row becomes G for the next iteration.
    516     mova                     m6, m14
    517     paddsw                   m8, krd
    518     paddsw                   m9, krd
    519     psraw                    m8, 7
    520     psraw                    m9, 7
    521 %ifidn %2, 4
    522     packuswb                 m8, m8
    523     packuswb                 m9, m9
    524 %else
            ; Width 8: both output rows share m8 (low / high 8 bytes).
    525     packuswb                 m8, m9
    526 %endif
    527 
    528 %ifidn %1, v8_avg
    529     movx                     m7, [dstq]
    530 %ifidn %2, 4
    531     movx                    m10, [dstq + dstrideq]
    532     pavgb                    m9, m10
    533 %else
    534     movhpd                   m7, [dstq + dstrideq]
    535 %endif
    536     pavgb                    m8, m7
    537 %endif
    538     movx                 [dstq], m8
    539 %ifidn %2, 4
    540     movx      [dstq + dstrideq], m9
    541 %else
    542     movhpd    [dstq + dstrideq], m8
    543 %endif
    544 
    545     lea                    dstq, [dstq + dstrideq * 2 ]
    546     sub                 heightd, 2
    547     jg                    .loop
    548 
    549     ; Do last row if output_height is odd
            ; (flags are still those of the `sub` above: non-zero => nothing left)
    550     jne                   .done
    551 
            ; m0/m2/m4 already hold AB/CD/EF for this row and m6 holds G; only H
            ; has to be loaded.
    552     movx                     m7, [srcq + sstrideq]          ;H
    553     punpcklbw                m6, m7                         ;G H
    554     pmaddubsw                m0, k0k1
    555     pmaddubsw                m2, k2k3
    556     pmaddubsw                m4, k4k5
    557     pmaddubsw                m6, k6k7
    558     paddsw                   m0, m4
    559     paddsw                   m2, m6
    560     paddsw                   m0, m2
    561     paddsw                   m0, krd
    562     psraw                    m0, 7
    563     packuswb                 m0, m0
    564 %ifidn %1, v8_avg
    565     movx                     m1, [dstq]
    566     pavgb                    m0, m1
    567 %endif
    568     movx                 [dstq], m0
    569 
    570 %endif ; ARCH_X86_64
    571 
    572 .done:
    573     REP_RET
    574 
    575 %endm
    576 
    577 ;-------------------------------------------------------------------------------
        ; SUBPIX_VFILTER16 %1
        ;   Emits filter_block1d16_%1: 8-tap vertical filter, 16 pixels per row.
        ;   %1 is the name suffix; when it is v8_avg the filtered bytes are
        ;   averaged (pavgb) with the bytes already stored at dst.
        ;   Args (cglobal order): src, sstride, dst, dstride, height, filter.
        ;   Tap constants, krd and (on x86-64) tmp0/tmp1 come from
        ;   SETUP_LOCAL_VARS.
        ;   Source rows are labelled A..H in the comments: A is the row at srcq
        ;   and each following letter is one sstride further.
        ;   dst is accessed with mova and as a pavgb memory operand; presumably
        ;   dst rows are 16-byte aligned -- confirm against callers.
        ;   Two variants are selected at assembly time:
        ;     - reload variant: one output row per iteration (dec/jnz), computed
        ;       as two 8-pixel halves whose rows are re-read every iteration;
        ;     - `%else` variant: two output rows per iteration with the
        ;       interleaved row pairs kept in m0-m11 and tmp0/tmp1, plus an
        ;       odd-height tail.
    578 %macro SUBPIX_VFILTER16 1
    579 cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
    580                              src, sstride, dst, dstride, height, filter
    581     mova                     m4, [filterq]
    582     SETUP_LOCAL_VARS
    583 
    584 %if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
    585 
    586 %if ARCH_X86_64
    587     %define               src1q  r7
    588     %define           sstride6q  r8
    589     %define          dst_stride  dstrideq
    590 %else
            ; No spare registers here: filterq is reused as the second row pointer
            ; (the taps were already copied to the stack by SETUP_LOCAL_VARS) and
            ; dstrideq is reused for 6 * sstride, so the dst stride is taken from
            ; dstridemp instead (presumably x86inc's in-memory form of the
            ; argument -- confirm).
    591     %define               src1q  filterq
    592     %define           sstride6q  dstrideq
    593     %define          dst_stride  dstridemp
    594 %endif
    595     lea                   src1q, [srcq + sstrideq]
    596     lea               sstride6q, [sstrideq + sstrideq * 4]
    597     add               sstride6q, sstrideq                   ;pitch * 6
    598 
    599 .loop:
            ; Left half (columns 0-7): accumulates in m0.
    600     movh                     m0, [srcq                ]     ;A
    601     movh                     m1, [src1q               ]     ;B
    602     movh                     m2, [srcq + sstrideq * 2 ]     ;C
    603     movh                     m3, [src1q + sstrideq * 2]     ;D
    604     movh                     m4, [srcq + sstrideq * 4 ]     ;E
    605     movh                     m5, [src1q + sstrideq * 4]     ;F
    606 
    607     punpcklbw                m0, m1                         ;A B
    608     movh                     m6, [srcq + sstride6q]         ;G
    609     punpcklbw                m2, m3                         ;C D
    610     movh                     m7, [src1q + sstride6q]        ;H
    611     punpcklbw                m4, m5                         ;E F
    612     pmaddubsw                m0, k0k1
            ; Right half (columns 8-15, the "+ 8" loads) starts here, interleaved
            ; with the remaining left-half arithmetic; it accumulates in m3.
    613     movh                     m3, [srcq + 8]                 ;A
    614     pmaddubsw                m2, k2k3
    615     punpcklbw                m6, m7                         ;G H
    616     movh                     m5, [srcq + sstrideq + 8]      ;B
    617     pmaddubsw                m4, k4k5
    618     punpcklbw                m3, m5                         ;A B
    619     movh                     m7, [srcq + sstrideq * 2 + 8]  ;C
    620     pmaddubsw                m6, k6k7
    621     movh                     m5, [src1q + sstrideq * 2 + 8] ;D
    622     punpcklbw                m7, m5                         ;C D
    623     paddsw                   m2, m6
    624     pmaddubsw                m3, k0k1
    625     movh                     m1, [srcq + sstrideq * 4 + 8]  ;E
    626     paddsw                   m0, m4
    627     pmaddubsw                m7, k2k3
    628     movh                     m6, [src1q + sstrideq * 4 + 8] ;F
    629     punpcklbw                m1, m6                         ;E F
    630     paddsw                   m0, m2
    631     paddsw                   m0, krd
    632     movh                     m2, [srcq + sstride6q + 8]     ;G
    633     pmaddubsw                m1, k4k5
    634     movh                     m5, [src1q + sstride6q + 8]    ;H
    635     psraw                    m0, 7
    636     punpcklbw                m2, m5                         ;G H
    637     pmaddubsw                m2, k6k7
    638     paddsw                   m7, m2
    639     paddsw                   m3, m1
    640     paddsw                   m3, m7
    641     paddsw                   m3, krd
    642     psraw                    m3, 7
            ; Low 8 bytes = left half, high 8 bytes = right half.
    643     packuswb                 m0, m3
    644 
    645     add                    srcq, sstrideq
    646     add                   src1q, sstrideq
    647 %ifidn %1, v8_avg
    648     pavgb                    m0, [dstq]
    649 %endif
    650     mova                 [dstq], m0
    651     add                    dstq, dst_stride
    652     dec                 heightd
    653     jnz                   .loop
    654     REP_RET
    655 
    656 %else
    657     ; ARCH_X86_64
            ; Bias height by -1 so that, after `sub heightd, 2`, a zero result
            ; means exactly one row is left.
    658     dec                 heightd
    659 
            ; Preload rows A..G.  Low/high byte halves of each interleaved pair:
            ;   first output row:   AB = m0/m1,     CD = m4/m5, EF = m8/m9
            ;   second output row:  AB = tmp0/tmp1, CD = m6/m7, EF = m10/m11
            ;   m2 = G (the most recently loaded row)
    660     movu                     m1, [srcq                ]     ;A
    661     movu                     m3, [srcq + sstrideq     ]     ;B
    662     lea                    srcq, [srcq + sstrideq * 2]
    663     punpcklbw                m0, m1, m3                     ;A B
    664     punpckhbw                m1, m3                         ;A B
    665     movu                     m5, [srcq]                     ;C
    666     punpcklbw                m2, m3, m5                     ;A B next iter
    667     punpckhbw                m3, m5                         ;A B next iter
    668     mova                   tmp0, m2                         ;store to stack
    669     mova                   tmp1, m3                         ;store to stack
    670     movu                     m7, [srcq + sstrideq]          ;D
    671     lea                    srcq, [srcq + sstrideq * 2]
    672     punpcklbw                m4, m5, m7                     ;C D
    673     punpckhbw                m5, m7                         ;C D
    674     movu                     m9, [srcq]                     ;E
    675     punpcklbw                m6, m7, m9                     ;C D next iter
    676     punpckhbw                m7, m9                         ;C D next iter
    677     movu                    m11, [srcq + sstrideq]          ;F
    678     lea                    srcq, [srcq + sstrideq * 2]
    679     punpcklbw                m8, m9, m11                    ;E F
    680     punpckhbw                m9, m11                        ;E F
    681     movu                     m2, [srcq]                     ;G
    682     punpcklbw               m10, m11, m2                    ;E F next iter
    683     punpckhbw               m11, m2                         ;E F next iter
    684 
    685 .loop:
    686     ;Do two rows at once
            ; First output row, low 8 columns -> m13.  Each pair register is slid
            ; up one slot (CD -> AB, EF -> CD, GH -> EF) as soon as it has been
            ; consumed.
    687     pmaddubsw               m13, m0, k0k1
    688     mova                     m0, m4
    689     pmaddubsw               m14, m8, k4k5
    690     pmaddubsw               m15, m4, k2k3
    691     mova                     m4, m8
    692     paddsw                  m13, m14
    693     movu                     m3, [srcq + sstrideq]          ;H
    694     lea                    srcq, [srcq + sstrideq * 2]
    695     punpcklbw               m14, m2, m3                     ;G H
    696     mova                     m8, m14
    697     pmaddubsw               m14, k6k7
    698     paddsw                  m15, m14
    699     paddsw                  m13, m15
    700     paddsw                  m13, krd
    701     psraw                   m13, 7
    702 
            ; First output row, high 8 columns -> m14.
    703     pmaddubsw               m14, m1, k0k1
    704     pmaddubsw                m1, m9, k4k5
    705     pmaddubsw               m15, m5, k2k3
    706     paddsw                  m14, m1
    707     mova                     m1, m5
    708     mova                     m5, m9
    709     punpckhbw                m2, m3                         ;G H
    710     mova                     m9, m2
    711     pmaddubsw                m2, k6k7
    712     paddsw                  m15, m2
    713     paddsw                  m14, m15
    714     paddsw                  m14, krd
    715     psraw                   m14, 7
    716     packuswb                m13, m14
    717 %ifidn %1, v8_avg
    718     pavgb                   m13, [dstq]
    719 %endif
    720     mova                 [dstq], m13
    721 
    722     ; next iter
            ; Second output row, low 8 columns -> m15, using the pairs kept in
            ; tmp0 / m6 / m10.
    723     pmaddubsw               m15, tmp0, k0k1
    724     pmaddubsw               m14, m10, k4k5
    725     pmaddubsw               m13, m6, k2k3
    726     paddsw                  m15, m14
    727     mova                   tmp0, m6
    728     mova                     m6, m10
    729     movu                     m2, [srcq]                     ;G next iter
    730     punpcklbw               m14, m3, m2                     ;G H next iter
    731     mova                    m10, m14
    732     pmaddubsw               m14, k6k7
    733     paddsw                  m13, m14
    734     paddsw                  m15, m13
    735     paddsw                  m15, krd
    736     psraw                   m15, 7
    737 
            ; Second output row, high 8 columns -> m14, using tmp1 / m7 / m11.
    738     pmaddubsw               m14, tmp1, k0k1
    739     mova                   tmp1, m7
    740     pmaddubsw               m13, m7, k2k3
    741     mova                     m7, m11
    742     pmaddubsw               m11, k4k5
    743     paddsw                  m14, m11
    744     punpckhbw                m3, m2                         ;G H next iter
    745     mova                    m11, m3
    746     pmaddubsw                m3, k6k7
    747     paddsw                  m13, m3
    748     paddsw                  m14, m13
    749     paddsw                  m14, krd
    750     psraw                   m14, 7
    751     packuswb                m15, m14
    752 %ifidn %1, v8_avg
    753     pavgb                   m15, [dstq + dstrideq]
    754 %endif
    755     mova      [dstq + dstrideq], m15
    756     lea                    dstq, [dstq + dstrideq * 2]
    757     sub                 heightd, 2
    758     jg                    .loop
    759 
    760     ; Do last row if output_height is odd
            ; (flags are still those of the `sub` above: non-zero => nothing left)
    761     jne                   .done
    762 
            ; m0/m1, m4/m5, m8/m9 already hold AB/CD/EF for this row and m2 holds
            ; G; only H has to be loaded.
    763     movu                     m3, [srcq + sstrideq]          ;H
    764     punpcklbw                m6, m2, m3                     ;G H
    765     punpckhbw                m2, m3                         ;G H
    766     pmaddubsw                m0, k0k1
    767     pmaddubsw                m1, k0k1
    768     pmaddubsw                m4, k2k3
    769     pmaddubsw                m5, k2k3
    770     pmaddubsw                m8, k4k5
    771     pmaddubsw                m9, k4k5
    772     pmaddubsw                m6, k6k7
    773     pmaddubsw                m2, k6k7
    774     paddsw                   m0, m8
    775     paddsw                   m1, m9
    776     paddsw                   m4, m6
    777     paddsw                   m5, m2
    778     paddsw                   m0, m4
    779     paddsw                   m1, m5
    780     paddsw                   m0, krd
    781     paddsw                   m1, krd
    782     psraw                    m0, 7
    783     psraw                    m1, 7
    784     packuswb                 m0, m1
    785 %ifidn %1, v8_avg
    786     pavgb                    m0, [dstq]
    787 %endif
    788     mova                 [dstq], m0
    789 
    790 .done:
    791     REP_RET
    792 
    793 %endif ; ARCH_X86_64
    794 
    795 %endm
    796 
    797 INIT_XMM ssse3
        ; Expand each vertical macro twice: plain (v8) and averaging (v8_avg);
        ; SUBPIX_VFILTER additionally takes the row width (8 or 4).
        ; The trailing comments give the resulting symbol names.
    798 SUBPIX_VFILTER16     v8     ; vpx_filter_block1d16_v8_ssse3
    799 SUBPIX_VFILTER16 v8_avg     ; vpx_filter_block1d16_v8_avg_ssse3
    800 SUBPIX_VFILTER       v8, 8  ; vpx_filter_block1d8_v8_ssse3
    801 SUBPIX_VFILTER   v8_avg, 8  ; vpx_filter_block1d8_v8_avg_ssse3
    802 SUBPIX_VFILTER       v8, 4  ; vpx_filter_block1d4_v8_ssse3
    803 SUBPIX_VFILTER   v8_avg, 4  ; vpx_filter_block1d4_v8_avg_ssse3
    804