      1 /*
      2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "vpx_ports/mem.h"
     12 #include "vpx_dsp/mips/loopfilter_msa.h"
     13 
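        /*
         * MSA (MIPS SIMD) implementation of the VP9 16-wide loop filters.
         * Each entry point runs in two stages: a combined filter4/filter8 stage
         * whose intermediate results go into a small "filter48" scratch buffer,
         * followed by a wide (p7..q7) stage that only runs where the flat/flat2
         * masks allow it.
         */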
     14 int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
     15                                  uint8_t *filter48,
     16                                  const uint8_t *b_limit_ptr,
     17                                  const uint8_t *limit_ptr,
     18                                  const uint8_t *thresh_ptr) {
     19   v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
     20   v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
     21   v16u8 flat, mask, hev, thresh, b_limit, limit;
     22   v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
     23   v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
     24   v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
     25   v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
     26   v16u8 zero = { 0 };
     27 
     28   /* load vector elements */
     29   LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
     30 
     31   thresh = (v16u8)__msa_fill_b(*thresh_ptr);
     32   b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
     33   limit = (v16u8)__msa_fill_b(*limit_ptr);
     34 
     35   /* mask and hev */
     36   LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
     37                hev, mask, flat);
     38   VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
     39   VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
     40 
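          /* If flat is zero everywhere, no pixel needs the 8-tap filter: store
           * the filter4 results and return 1 so the caller skips the wide stage. */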
     41   if (__msa_test_bz_v(flat)) {
     42     ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
     43 
     44     return 1;
     45   } else {
     46     ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
     47                zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
     48                q2_r, q3_r);
     49     VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
     50                 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
     51 
     52     ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
     53     ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
     54     VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
     55                 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
     56 
     57     /* convert 16 bit output data into 8 bit */
     58     PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
     59                 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
     60                 p0_filt8_r, q0_filt8_r);
     61     PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
     62                 q2_filt8_r);
     63 
     64     /* store pixel values */
     65     p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
     66     p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
     67     p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
     68     q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
     69     q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
     70     q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
     71 
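            /* filter48 layout: p2, p1, p0, q0, q1, q2 from the 8-tap filter as
             * six 16-byte vectors, then the flat mask at byte offset 96; it is
             * consumed by vpx_hz_lpf_t16_16w(). */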
     72     ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
     73     filter48 += (4 * 16);
     74     ST_UB2(q1_out, q2_out, filter48, 16);
     75     filter48 += (2 * 16);
     76     ST_UB(flat, filter48);
     77 
     78     return 0;
     79   }
     80 }
     81 
     82 void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
     83   v16u8 flat, flat2, filter8;
     84   v16i8 zero = { 0 };
     85   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
     86   v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
     87   v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
     88   v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
     89   v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
     90   v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
     91   v8i16 l_out, r_out;
     92 
     93   flat = LD_UB(filter48 + 96);
     94 
     95   LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
     96   LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
     97   VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
     98 
     99   if (__msa_test_bz_v(flat2)) {
    100     LD_UB4(filter48, 16, p2, p1, p0, q0);
    101     LD_UB2(filter48 + 4 * 16, 16, q1, q2);
    102 
    103     src -= 3 * pitch;
    104     ST_UB4(p2, p1, p0, q0, src, pitch);
    105     src += (4 * pitch);
    106     ST_UB2(q1, q2, src, pitch);
    107   } else {
    108     src -= 7 * pitch;
    109 
    110     ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
    111                zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
    112                p2_r_in, p1_r_in, p0_r_in);
    113 
    114     q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
    115 
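            /* Wide filter, right half. The p6 output is
             * (p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
             * tmp1_r keeps the running sum so later outputs only slide the
             * window with a few adds/subtracts instead of recomputing it. */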
    116     tmp0_r = p7_r_in << 3;
    117     tmp0_r -= p7_r_in;
    118     tmp0_r += p6_r_in;
    119     tmp0_r += q0_r_in;
    120     tmp1_r = p6_r_in + p5_r_in;
    121     tmp1_r += p4_r_in;
    122     tmp1_r += p3_r_in;
    123     tmp1_r += p2_r_in;
    124     tmp1_r += p1_r_in;
    125     tmp1_r += p0_r_in;
    126     tmp1_r += tmp0_r;
    127     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    128 
    129     ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
    130                p5_l_in, p4_l_in);
    131     ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
    132                p1_l_in, p0_l_in);
    133     q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
    134 
    135     tmp0_l = p7_l_in << 3;
    136     tmp0_l -= p7_l_in;
    137     tmp0_l += p6_l_in;
    138     tmp0_l += q0_l_in;
    139     tmp1_l = p6_l_in + p5_l_in;
    140     tmp1_l += p4_l_in;
    141     tmp1_l += p3_l_in;
    142     tmp1_l += p2_l_in;
    143     tmp1_l += p1_l_in;
    144     tmp1_l += p0_l_in;
    145     tmp1_l += tmp0_l;
    146     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    147 
    148     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    149     p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    150     ST_UB(p6, src);
    151     src += pitch;
    152 
    153     /* p5 */
    154     q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    155     tmp0_r = p5_r_in - p6_r_in;
    156     tmp0_r += q1_r_in;
    157     tmp0_r -= p7_r_in;
    158     tmp1_r += tmp0_r;
    159     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    160 
    161     q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
    162     tmp0_l = p5_l_in - p6_l_in;
    163     tmp0_l += q1_l_in;
    164     tmp0_l -= p7_l_in;
    165     tmp1_l += tmp0_l;
    166     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    167 
    168     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    169     p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    170     ST_UB(p5, src);
    171     src += pitch;
    172 
    173     /* p4 */
    174     q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    175     tmp0_r = p4_r_in - p5_r_in;
    176     tmp0_r += q2_r_in;
    177     tmp0_r -= p7_r_in;
    178     tmp1_r += tmp0_r;
    179     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    180 
    181     q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
    182     tmp0_l = p4_l_in - p5_l_in;
    183     tmp0_l += q2_l_in;
    184     tmp0_l -= p7_l_in;
    185     tmp1_l += tmp0_l;
    186     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    187 
    188     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    189     p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    190     ST_UB(p4, src);
    191     src += pitch;
    192 
    193     /* p3 */
    194     q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    195     tmp0_r = p3_r_in - p4_r_in;
    196     tmp0_r += q3_r_in;
    197     tmp0_r -= p7_r_in;
    198     tmp1_r += tmp0_r;
    199     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    200 
    201     q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
    202     tmp0_l = p3_l_in - p4_l_in;
    203     tmp0_l += q3_l_in;
    204     tmp0_l -= p7_l_in;
    205     tmp1_l += tmp0_l;
    206     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    207 
    208     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    209     p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    210     ST_UB(p3, src);
    211     src += pitch;
    212 
    213     /* p2 */
    214     q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    215     filter8 = LD_UB(filter48);
    216     tmp0_r = p2_r_in - p3_r_in;
    217     tmp0_r += q4_r_in;
    218     tmp0_r -= p7_r_in;
    219     tmp1_r += tmp0_r;
    220     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    221 
    222     q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
    223     tmp0_l = p2_l_in - p3_l_in;
    224     tmp0_l += q4_l_in;
    225     tmp0_l -= p7_l_in;
    226     tmp1_l += tmp0_l;
    227     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    228 
    229     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    230     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    231     ST_UB(filter8, src);
    232     src += pitch;
    233 
    234     /* p1 */
    235     q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    236     filter8 = LD_UB(filter48 + 16);
    237     tmp0_r = p1_r_in - p2_r_in;
    238     tmp0_r += q5_r_in;
    239     tmp0_r -= p7_r_in;
    240     tmp1_r += tmp0_r;
    241     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    242 
    243     q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
    244     tmp0_l = p1_l_in - p2_l_in;
    245     tmp0_l += q5_l_in;
    246     tmp0_l -= p7_l_in;
    247     tmp1_l += tmp0_l;
    248     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    249 
    250     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    251     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    252     ST_UB(filter8, src);
    253     src += pitch;
    254 
    255     /* p0 */
    256     q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    257     filter8 = LD_UB(filter48 + 32);
    258     tmp0_r = p0_r_in - p1_r_in;
    259     tmp0_r += q6_r_in;
    260     tmp0_r -= p7_r_in;
    261     tmp1_r += tmp0_r;
    262     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    263 
    264     q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
    265     tmp0_l = p0_l_in - p1_l_in;
    266     tmp0_l += q6_l_in;
    267     tmp0_l -= p7_l_in;
    268     tmp1_l += tmp0_l;
    269     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    270 
    271     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    272     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    273     ST_UB(filter8, src);
    274     src += pitch;
    275 
    276     /* q0 */
    277     q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    278     filter8 = LD_UB(filter48 + 48);
    279     tmp0_r = q7_r_in - p0_r_in;
    280     tmp0_r += q0_r_in;
    281     tmp0_r -= p7_r_in;
    282     tmp1_r += tmp0_r;
    283     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    284 
    285     q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
    286     tmp0_l = q7_l_in - p0_l_in;
    287     tmp0_l += q0_l_in;
    288     tmp0_l -= p7_l_in;
    289     tmp1_l += tmp0_l;
    290     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    291 
    292     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    293     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    294     ST_UB(filter8, src);
    295     src += pitch;
    296 
    297     /* q1 */
    298     filter8 = LD_UB(filter48 + 64);
    299     tmp0_r = q7_r_in - q0_r_in;
    300     tmp0_r += q1_r_in;
    301     tmp0_r -= p6_r_in;
    302     tmp1_r += tmp0_r;
    303     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    304 
    305     tmp0_l = q7_l_in - q0_l_in;
    306     tmp0_l += q1_l_in;
    307     tmp0_l -= p6_l_in;
    308     tmp1_l += tmp0_l;
    309     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    310 
    311     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    312     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    313     ST_UB(filter8, src);
    314     src += pitch;
    315 
    316     /* q2 */
    317     filter8 = LD_UB(filter48 + 80);
    318     tmp0_r = q7_r_in - q1_r_in;
    319     tmp0_r += q2_r_in;
    320     tmp0_r -= p5_r_in;
    321     tmp1_r += tmp0_r;
    322     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    323 
    324     tmp0_l = q7_l_in - q1_l_in;
    325     tmp0_l += q2_l_in;
    326     tmp0_l -= p5_l_in;
    327     tmp1_l += tmp0_l;
    328     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    329 
    330     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    331     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    332     ST_UB(filter8, src);
    333     src += pitch;
    334 
    335     /* q3 */
    336     tmp0_r = q7_r_in - q2_r_in;
    337     tmp0_r += q3_r_in;
    338     tmp0_r -= p4_r_in;
    339     tmp1_r += tmp0_r;
    340     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    341 
    342     tmp0_l = q7_l_in - q2_l_in;
    343     tmp0_l += q3_l_in;
    344     tmp0_l -= p4_l_in;
    345     tmp1_l += tmp0_l;
    346     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    347 
    348     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    349     q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    350     ST_UB(q3, src);
    351     src += pitch;
    352 
    353     /* q4 */
    354     tmp0_r = q7_r_in - q3_r_in;
    355     tmp0_r += q4_r_in;
    356     tmp0_r -= p3_r_in;
    357     tmp1_r += tmp0_r;
    358     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    359 
    360     tmp0_l = q7_l_in - q3_l_in;
    361     tmp0_l += q4_l_in;
    362     tmp0_l -= p3_l_in;
    363     tmp1_l += tmp0_l;
    364     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    365 
    366     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    367     q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    368     ST_UB(q4, src);
    369     src += pitch;
    370 
    371     /* q5 */
    372     tmp0_r = q7_r_in - q4_r_in;
    373     tmp0_r += q5_r_in;
    374     tmp0_r -= p2_r_in;
    375     tmp1_r += tmp0_r;
    376     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    377 
    378     tmp0_l = q7_l_in - q4_l_in;
    379     tmp0_l += q5_l_in;
    380     tmp0_l -= p2_l_in;
    381     tmp1_l += tmp0_l;
    382     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    383 
    384     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    385     q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    386     ST_UB(q5, src);
    387     src += pitch;
    388 
    389     /* q6 */
    390     tmp0_r = q7_r_in - q5_r_in;
    391     tmp0_r += q6_r_in;
    392     tmp0_r -= p1_r_in;
    393     tmp1_r += tmp0_r;
    394     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    395 
    396     tmp0_l = q7_l_in - q5_l_in;
    397     tmp0_l += q6_l_in;
    398     tmp0_l -= p1_l_in;
    399     tmp1_l += tmp0_l;
    400     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    401 
    402     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    403     q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    404     ST_UB(q6, src);
    405   }
    406 }
    407 
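        /* Horizontal 16-wide filter across two adjacent 8-column segments. The
         * first stage returns 1 when no column needs more than the 4-tap filter,
         * in which case the wide stage is skipped entirely. */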
    408 void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
    409                                     const uint8_t *b_limit_ptr,
    410                                     const uint8_t *limit_ptr,
    411                                     const uint8_t *thresh_ptr,
    412                                     int32_t count) {
    413   DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
    414   uint8_t early_exit = 0;
    415 
    416   (void)count;
    417 
    418   early_exit = vpx_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
    419                                         limit_ptr, thresh_ptr);
    420 
    421   if (0 == early_exit) {
    422     vpx_hz_lpf_t16_16w(src, pitch, filter48);
    423   }
    424 }
    425 
    426 void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
    427                                const uint8_t *b_limit_ptr,
    428                                const uint8_t *limit_ptr,
    429                                const uint8_t *thresh_ptr,
    430                                int32_t count) {
    431   if (1 == count) {
    432     uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    433     uint64_t dword0, dword1;
    434     v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
    435     v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
    436     v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    437     v16u8 p0_filter16, p1_filter16;
    438     v8i16 p2_filter8, p1_filter8, p0_filter8;
    439     v8i16 q0_filter8, q1_filter8, q2_filter8;
    440     v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
    441     v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
    442     v16i8 zero = { 0 };
    443     v8u16 tmp0, tmp1, tmp2;
    444 
    445     /* load vector elements */
    446     LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
    447 
    448     thresh = (v16u8)__msa_fill_b(*thresh_ptr);
    449     b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
    450     limit = (v16u8)__msa_fill_b(*limit_ptr);
    451 
    452     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
    453                  hev, mask, flat);
    454     VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    455     VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
    456                        q1_out);
    457 
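            /* Only 8 columns are filtered in this path, so clear the upper half
             * of the flat mask before testing it. */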
    458     flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
    459 
    460     if (__msa_test_bz_v(flat)) {
    461       p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
    462       p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
    463       q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
    464       q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    465       SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
    466     } else {
    467       /* convert 8 bit input data into 16 bit */
    468       ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
    469                  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
    470                  q3_r);
    471       VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
    472                   p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
    473 
    474       /* convert 16 bit output data into 8 bit */
    475       PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
    476                   zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
    477                   q0_filter8);
    478       PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
    479 
    480       /* store pixel values */
    481       p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
    482       p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
    483       p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
    484       q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
    485       q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
    486       q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);
    487 
    488       /* load the outer rows: p7..p4 above and q4..q7 below */
    489       LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
    490       LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
    491 
    492       VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
    493 
    494       if (__msa_test_bz_v(flat2)) {
    495         p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
    496         p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
    497         p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
    498         q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
    499         q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    500         q2_d = __msa_copy_u_d((v2i64)q2_out, 0);
    501 
    502         SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
    503         SD(q1_d, src + pitch);
    504         SD(q2_d, src + 2 * pitch);
    505       } else {
    506         /* operate on the right (lower) 8 pixels, widened to 16 bits */
    507         ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5,
    508                    zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r,
    509                    q7_r);
    510 
    511         tmp0 = p7_r << 3;
    512         tmp0 -= p7_r;
    513         tmp0 += p6_r;
    514         tmp0 += q0_r;
    515 
    516         src -= 7 * pitch;
    517 
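                /* Wide filter over the lower 8 columns: tmp1 holds the 15-tap
                 * running sum; each block below slides it twice to produce two
                 * output rows at a time. */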
    518         /* calculation of p6 and p5 */
    519         tmp1 = p6_r + p5_r + p4_r + p3_r;
    520         tmp1 += (p2_r + p1_r + p0_r);
    521         tmp1 += tmp0;
    522         p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    523         tmp0 = p5_r - p6_r + q1_r - p7_r;
    524         tmp1 += tmp0;
    525         p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    526         PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
    527                     p1_filter16);
    528         p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
    529         p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
    530         dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
    531         dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
    532         SD(dword0, src);
    533         src += pitch;
    534         SD(dword1, src);
    535         src += pitch;
    536 
    537         /* calculation of p4 and p3 */
    538         tmp0 = p4_r - p5_r + q2_r - p7_r;
    539         tmp2 = p3_r - p4_r + q3_r - p7_r;
    540         tmp1 += tmp0;
    541         p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    542         tmp1 += tmp2;
    543         p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    544         PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
    545                     p1_filter16);
    546         p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
    547         p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
    548         dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
    549         dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
    550         SD(dword0, src);
    551         src += pitch;
    552         SD(dword1, src);
    553         src += pitch;
    554 
    555         /* calculation of p2 and p1 */
    556         tmp0 = p2_r - p3_r + q4_r - p7_r;
    557         tmp2 = p1_r - p2_r + q5_r - p7_r;
    558         tmp1 += tmp0;
    559         p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    560         tmp1 += tmp2;
    561         p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    562         PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
    563                     p1_filter16);
    564         p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
    565         p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
    566         dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
    567         dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
    568         SD(dword0, src);
    569         src += pitch;
    570         SD(dword1, src);
    571         src += pitch;
    572 
    573         /* calculation of p0 and q0 */
    574         tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
    575         tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
    576         tmp1 += tmp0;
    577         p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    578         tmp1 += tmp2;
    579         p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    580         PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
    581                     p1_filter16);
    582         p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
    583         p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
    584         dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
    585         dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
    586         SD(dword0, src);
    587         src += pitch;
    588         SD(dword1, src);
    589         src += pitch;
    590 
    591         /* calculation of q1 and q2 */
    592         tmp0 = q7_r - q0_r + q1_r - p6_r;
    593         tmp2 = q7_r - q1_r + q2_r - p5_r;
    594         tmp1 += tmp0;
    595         p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    596         tmp1 += tmp2;
    597         p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    598         PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
    599                     p1_filter16);
    600         p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
    601         p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
    602         dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
    603         dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
    604         SD(dword0, src);
    605         src += pitch;
    606         SD(dword1, src);
    607         src += pitch;
    608 
    609         /* calculation of q3 and q4 */
    610         tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
    611         tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
    612         tmp1 += tmp0;
    613         p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    614         tmp1 += tmp2;
    615         p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    616         PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
    617                     p1_filter16);
    618         p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
    619         p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
    620         dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
    621         dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
    622         SD(dword0, src);
    623         src += pitch;
    624         SD(dword1, src);
    625         src += pitch;
    626 
    627         /* calculation of q5 and q6 */
    628         tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
    629         tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
    630         tmp1 += tmp0;
    631         p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    632         tmp1 += tmp2;
    633         p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    634         PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
    635                     p1_filter16);
    636         p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
    637         p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
    638         dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
    639         dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
    640         SD(dword0, src);
    641         src += pitch;
    642         SD(dword1, src);
    643       }
    644     }
    645   } else {
    646     vpx_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr,
    647                                    thresh_ptr, count);
    648   }
    649 }
    650 
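        /* The vertical filters below reuse the horizontal kernels: the pixel
         * neighborhood is transposed into a scratch buffer with a pitch of 16,
         * filtered there, and transposed back only when the wide-filter path was
         * taken (the earlier stages store their results to src directly). */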
    651 static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
    652                                    uint8_t *output, int32_t out_pitch) {
    653   v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
    654   v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    655   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
    656 
    657   LD_UB8(input, in_pitch,
    658          p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
    659   /* 8x8 transpose */
    660   TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
    661                      p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
    662   /* 8x8 transpose */
    663   ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
    664              tmp0, tmp1, tmp2, tmp3);
    665   ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
    666   ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
    667   ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
    668   ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
    669   SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);
    670 
    671   ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
    672   output += (8 * out_pitch);
    673   ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
    674 }
    675 
    676 static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
    677                                    uint8_t *output, int32_t out_pitch) {
    678   v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
    679   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
    680 
    681   LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
    682   LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
    683   TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
    684                       q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
    685   ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
    686 }
    687 
    688 static void transpose_16x16(uint8_t *input, int32_t in_pitch,
    689                             uint8_t *output, int32_t out_pitch) {
    690   v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    691   v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
    692   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
    693   v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
    694   v4i32 tmp2, tmp3;
    695 
    696   LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    697   input += (8 * in_pitch);
    698   LD_UB8(input, in_pitch,
    699          row8, row9, row10, row11, row12, row13, row14, row15);
    700 
    701   TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
    702                       row8, row9, row10, row11, row12, row13, row14, row15,
    703                       p7, p6, p5, p4, p3, p2, p1, p0);
    704 
    705   /* transpose 16x8 matrix into 8x16 */
    706   /* total of 8 intermediate registers and 32 instructions */
    707   q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
    708   q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
    709   q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
    710   q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
    711   q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
    712   q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
    713   q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
    714   q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);
    715 
    716   ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
    717   tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
    718   tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);
    719 
    720   ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
    721   tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
    722   tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);
    723 
    724   ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
    725   q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
    726   q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
    727 
    728   tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0);
    729   tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5);
    730   q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
    731   q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
    732 
    733   ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
    734   q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
    735   q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
    736 
    737   tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4);
    738   tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6);
    739   q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
    740   q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
    741 
    742   ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
    743   output += (8 * out_pitch);
    744   ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
    745 }
    746 
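        /* Vertical-edge helpers: "src" points into the transposed scratch buffer
         * (pitch 16), while the early-exit paths write their narrow-filter output
         * straight to "src_org" in the original picture layout. */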
    747 int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
    748                                 uint8_t *src_org, int32_t pitch_org,
    749                                 const uint8_t *b_limit_ptr,
    750                                 const uint8_t *limit_ptr,
    751                                 const uint8_t *thresh_ptr) {
    752   v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    753   v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    754   v16u8 flat, mask, hev, thresh, b_limit, limit;
    755   v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    756   v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
    757   v16i8 zero = { 0 };
    758   v8i16 vec0, vec1, vec2, vec3;
    759 
    760   /* load vector elements */
    761   LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
    762 
    763   thresh = (v16u8)__msa_fill_b(*thresh_ptr);
    764   b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
    765   limit = (v16u8)__msa_fill_b(*limit_ptr);
    766 
    767   /* mask and hev */
    768   LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
    769                hev, mask, flat);
    770   /* flat4 */
    771   VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    772   /* filter4 */
    773   VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
    774 
    775   flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
    776 
    777   if (__msa_test_bz_v(flat)) {
    778     ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    779     ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    780     ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
    781     return 1;
    782   } else {
    783     ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
    784                zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
    785                q3_r);
    786     VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
    787                 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
    788 
    789     /* convert 16 bit output data into 8 bit */
    790     p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r);
    791     p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r);
    792     p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r);
    793     q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r);
    794     q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r);
    795     q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r);
    796 
    797     /* store pixel values */
    798     p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat);
    799     p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat);
    800     p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat);
    801     q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat);
    802     q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat);
    803     q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat);
    804 
    805     ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    806     filter48 += (4 * 16);
    807     ST_UB2(q1_out, q2_out, filter48, 16);
    808     filter48 += (2 * 16);
    809     ST_UB(flat, filter48);
    810 
    811     return 0;
    812   }
    813 }
    814 
    815 int32_t vpx_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
    816                           uint8_t *filter48) {
    817   v16i8 zero = { 0 };
    818   v16u8 filter8, flat, flat2;
    819   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
    820   v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
    821   v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
    822   v8u16 tmp0_r, tmp1_r;
    823   v8i16 r_out;
    824 
    825   flat = LD_UB(filter48 + 6 * 16);
    826 
    827   LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
    828   LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
    829 
    830   VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
    831 
    832   if (__msa_test_bz_v(flat2)) {
    833     v8i16 vec0, vec1, vec2, vec3, vec4;
    834 
    835     LD_UB4(filter48, 16, p2, p1, p0, q0);
    836     LD_UB2(filter48 + 4 * 16, 16, q1, q2);
    837 
    838     ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    839     ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    840     vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);
    841 
    842     src_org -= 3;
    843     ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
    844     ST2x4_UB(vec2, 0, (src_org + 4), pitch);
    845     src_org += (4 * pitch);
    846     ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
    847     ST2x4_UB(vec2, 4, (src_org + 4), pitch);
    848 
    849     return 1;
    850   } else {
    851     src -= 7 * 16;
    852 
    853     ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
    854                zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
    855                p3_r_in, p2_r_in, p1_r_in, p0_r_in);
    856     q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
    857 
    858     tmp0_r = p7_r_in << 3;
    859     tmp0_r -= p7_r_in;
    860     tmp0_r += p6_r_in;
    861     tmp0_r += q0_r_in;
    862     tmp1_r = p6_r_in + p5_r_in;
    863     tmp1_r += p4_r_in;
    864     tmp1_r += p3_r_in;
    865     tmp1_r += p2_r_in;
    866     tmp1_r += p1_r_in;
    867     tmp1_r += p0_r_in;
    868     tmp1_r += tmp0_r;
    869 
    870     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    871     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    872     p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    873     ST8x1_UB(p6, src);
    874     src += 16;
    875 
    876     /* p5 */
    877     q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    878     tmp0_r = p5_r_in - p6_r_in;
    879     tmp0_r += q1_r_in;
    880     tmp0_r -= p7_r_in;
    881     tmp1_r += tmp0_r;
    882     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    883     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    884     p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    885     ST8x1_UB(p5, src);
    886     src += 16;
    887 
    888     /* p4 */
    889     q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    890     tmp0_r = p4_r_in - p5_r_in;
    891     tmp0_r += q2_r_in;
    892     tmp0_r -= p7_r_in;
    893     tmp1_r += tmp0_r;
    894     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    895     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    896     p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    897     ST8x1_UB(p4, src);
    898     src += 16;
    899 
    900     /* p3 */
    901     q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    902     tmp0_r = p3_r_in - p4_r_in;
    903     tmp0_r += q3_r_in;
    904     tmp0_r -= p7_r_in;
    905     tmp1_r += tmp0_r;
    906     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    907     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    908     p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    909     ST8x1_UB(p3, src);
    910     src += 16;
    911 
    912     /* p2 */
    913     q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    914     filter8 = LD_UB(filter48);
    915     tmp0_r = p2_r_in - p3_r_in;
    916     tmp0_r += q4_r_in;
    917     tmp0_r -= p7_r_in;
    918     tmp1_r += tmp0_r;
    919     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    920     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    921     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    922     ST8x1_UB(filter8, src);
    923     src += 16;
    924 
    925     /* p1 */
    926     q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    927     filter8 = LD_UB(filter48 + 16);
    928     tmp0_r = p1_r_in - p2_r_in;
    929     tmp0_r += q5_r_in;
    930     tmp0_r -= p7_r_in;
    931     tmp1_r += tmp0_r;
    932     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    933     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    934     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    935     ST8x1_UB(filter8, src);
    936     src += 16;
    937 
    938     /* p0 */
    939     q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    940     filter8 = LD_UB(filter48 + 32);
    941     tmp0_r = p0_r_in - p1_r_in;
    942     tmp0_r += q6_r_in;
    943     tmp0_r -= p7_r_in;
    944     tmp1_r += tmp0_r;
    945     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    946     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    947     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    948     ST8x1_UB(filter8, src);
    949     src += 16;
    950 
    951     /* q0 */
    952     q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    953     filter8 = LD_UB(filter48 + 48);
    954     tmp0_r = q7_r_in - p0_r_in;
    955     tmp0_r += q0_r_in;
    956     tmp0_r -= p7_r_in;
    957     tmp1_r += tmp0_r;
    958     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    959     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    960     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    961     ST8x1_UB(filter8, src);
    962     src += 16;
    963 
    964     /* q1 */
    965     filter8 = LD_UB(filter48 + 64);
    966     tmp0_r = q7_r_in - q0_r_in;
    967     tmp0_r += q1_r_in;
    968     tmp0_r -= p6_r_in;
    969     tmp1_r += tmp0_r;
    970     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    971     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    972     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    973     ST8x1_UB(filter8, src);
    974     src += 16;
    975 
    976     /* q2 */
    977     filter8 = LD_UB(filter48 + 80);
    978     tmp0_r = q7_r_in - q1_r_in;
    979     tmp0_r += q2_r_in;
    980     tmp0_r -= p5_r_in;
    981     tmp1_r += tmp0_r;
    982     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    983     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    984     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    985     ST8x1_UB(filter8, src);
    986     src += 16;
    987 
    988     /* q3 */
    989     tmp0_r = q7_r_in - q2_r_in;
    990     tmp0_r += q3_r_in;
    991     tmp0_r -= p4_r_in;
    992     tmp1_r += tmp0_r;
    993     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    994     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    995     q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    996     ST8x1_UB(q3, src);
    997     src += 16;
    998 
    999     /* q4 */
   1000     tmp0_r = q7_r_in - q3_r_in;
   1001     tmp0_r += q4_r_in;
   1002     tmp0_r -= p3_r_in;
   1003     tmp1_r += tmp0_r;
   1004     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1005     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
   1006     q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
   1007     ST8x1_UB(q4, src);
   1008     src += 16;
   1009 
   1010     /* q5 */
   1011     tmp0_r = q7_r_in - q4_r_in;
   1012     tmp0_r += q5_r_in;
   1013     tmp0_r -= p2_r_in;
   1014     tmp1_r += tmp0_r;
   1015     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1016     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
   1017     q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
   1018     ST8x1_UB(q5, src);
   1019     src += 16;
   1020 
   1021     /* q6 */
   1022     tmp0_r = q7_r_in - q5_r_in;
   1023     tmp0_r += q6_r_in;
   1024     tmp0_r -= p1_r_in;
   1025     tmp1_r += tmp0_r;
   1026     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1027     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
   1028     q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
   1029     ST8x1_UB(q6, src);
   1030 
   1031     return 0;
   1032   }
   1033 }
   1034 
   1035 void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
   1036                              const uint8_t *b_limit_ptr,
   1037                              const uint8_t *limit_ptr,
   1038                              const uint8_t *thresh_ptr) {
   1039   uint8_t early_exit = 0;
   1040   DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
   1041   uint8_t *filter48 = &transposed_input[16 * 16];
   1042 
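          /* The first 16 rows (pitch 16) hold the transposed neighborhood; the
           * last 128 bytes of transposed_input double as the filter48 scratch. */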
   1043   transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
   1044 
   1045   early_exit = vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
   1046                                        &filter48[0], src, pitch, b_limit_ptr,
   1047                                        limit_ptr, thresh_ptr);
   1048 
   1049   if (0 == early_exit) {
   1050     early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
   1051                                    &filter48[0]);
   1052 
   1053     if (0 == early_exit) {
   1054       transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
   1055     }
   1056   }
   1057 }
   1058 
   1059 int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
   1060                                  uint8_t *src_org, int32_t pitch,
   1061                                  const uint8_t *b_limit_ptr,
   1062                                  const uint8_t *limit_ptr,
   1063                                  const uint8_t *thresh_ptr) {
   1064   v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
   1065   v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
   1066   v16u8 flat, mask, hev, thresh, b_limit, limit;
   1067   v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
   1068   v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
   1069   v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
   1070   v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
   1071   v16i8 zero = { 0 };
   1072   v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
   1073 
   1074   /* load vector elements */
   1075   LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
   1076 
   1077   thresh = (v16u8)__msa_fill_b(*thresh_ptr);
   1078   b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
   1079   limit = (v16u8)__msa_fill_b(*limit_ptr);
   1080 
   1081   /* mask and hev */
   1082   LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
   1083                hev, mask, flat);
   1084   /* flat4 */
   1085   VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
   1086   /* filter4 */
   1087   VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
   1088 
   1089   if (__msa_test_bz_v(flat)) {
   1090     ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
   1091     ILVRL_H2_SH(vec1, vec0, vec2, vec3);
   1092     ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
   1093     ILVRL_H2_SH(vec1, vec0, vec4, vec5);
   1094 
   1095     src_org -= 2;
   1096     ST4x8_UB(vec2, vec3, src_org, pitch);
   1097     src_org += 8 * pitch;
   1098     ST4x8_UB(vec4, vec5, src_org, pitch);
   1099 
   1100     return 1;
   1101   } else {
   1102     ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
   1103                zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
   1104                q3_r);
   1105     VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
   1106                 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
   1107     ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
   1108     ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
   1109     VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
   1110                 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
   1111 
   1112     /* convert 16 bit output data into 8 bit */
   1113     PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
   1114                 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
   1115                 p0_filt8_r, q0_filt8_r);
   1116     PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
   1117                 q2_filt8_r);
   1118 
   1119     /* store pixel values */
   1120     p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
   1121     p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
   1122     p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
   1123     q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
   1124     q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
   1125     q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
   1126 
   1127     ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
   1128     filter48 += (4 * 16);
   1129     ST_UB2(q1_out, q2_out, filter48, 16);
   1130     filter48 += (2 * 16);
   1131     ST_UB(flat, filter48);
   1132 
   1133     return 0;
   1134   }
   1135 }
   1136 
   1137 int32_t vpx_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
   1138                            uint8_t *filter48) {
   1139   v16u8 flat, flat2, filter8;
   1140   v16i8 zero = { 0 };
   1141   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
   1142   v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
   1143   v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
   1144   v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
   1145   v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
   1146   v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
   1147   v8i16 l_out, r_out;
   1148 
   1149   flat = LD_UB(filter48 + 6 * 16);
   1150 
   1151   LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
   1152   LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
   1153 
   1154   VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
   1155 
   1156   if (__msa_test_bz_v(flat2)) {
   1157     v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
   1158 
   1159     LD_UB4(filter48, 16, p2, p1, p0, q0);
   1160     LD_UB2(filter48 + 4 * 16, 16, q1, q2);
   1161 
   1162     ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
   1163     ILVRL_H2_SH(vec1, vec0, vec3, vec4);
   1164     ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
   1165     ILVRL_H2_SH(vec1, vec0, vec6, vec7);
   1166     ILVRL_B2_SH(q2, q1, vec2, vec5);
   1167 
   1168     src_org -= 3;
   1169     ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
   1170     ST2x4_UB(vec2, 0, (src_org + 4), pitch);
   1171     src_org += (4 * pitch);
   1172     ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
   1173     ST2x4_UB(vec2, 4, (src_org + 4), pitch);
   1174     src_org += (4 * pitch);
   1175     ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
   1176     ST2x4_UB(vec5, 0, (src_org + 4), pitch);
   1177     src_org += (4 * pitch);
   1178     ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
   1179     ST2x4_UB(vec5, 4, (src_org + 4), pitch);
   1180 
   1181     return 1;
   1182   } else {
   1183     src -= 7 * 16;
   1184 
   1185     ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
   1186                zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
   1187                p3_r_in, p2_r_in, p1_r_in, p0_r_in);
   1188     q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
   1189 
   1190     tmp0_r = p7_r_in << 3;
   1191     tmp0_r -= p7_r_in;
   1192     tmp0_r += p6_r_in;
   1193     tmp0_r += q0_r_in;
   1194     tmp1_r = p6_r_in + p5_r_in;
   1195     tmp1_r += p4_r_in;
   1196     tmp1_r += p3_r_in;
   1197     tmp1_r += p2_r_in;
   1198     tmp1_r += p1_r_in;
   1199     tmp1_r += p0_r_in;
   1200     tmp1_r += tmp0_r;
   1201     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1202 
   1203     ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
   1204                p5_l_in, p4_l_in);
   1205     ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
   1206                p1_l_in, p0_l_in);
   1207     q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
   1208 
   1209     tmp0_l = p7_l_in << 3;
   1210     tmp0_l -= p7_l_in;
   1211     tmp0_l += p6_l_in;
   1212     tmp0_l += q0_l_in;
   1213     tmp1_l = p6_l_in + p5_l_in;
   1214     tmp1_l += p4_l_in;
   1215     tmp1_l += p3_l_in;
   1216     tmp1_l += p2_l_in;
   1217     tmp1_l += p1_l_in;
   1218     tmp1_l += p0_l_in;
   1219     tmp1_l += tmp0_l;
   1220     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1221 
   1222     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1223     p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
   1224     ST_UB(p6, src);
   1225     src += 16;
   1226 
   1227     /* p5 */
   1228     q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
   1229     tmp0_r = p5_r_in - p6_r_in;
   1230     tmp0_r += q1_r_in;
   1231     tmp0_r -= p7_r_in;
   1232     tmp1_r += tmp0_r;
   1233     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1234     q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
   1235     tmp0_l = p5_l_in - p6_l_in;
   1236     tmp0_l += q1_l_in;
   1237     tmp0_l -= p7_l_in;
   1238     tmp1_l += tmp0_l;
   1239     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1240     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1241     p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
   1242     ST_UB(p5, src);
   1243     src += 16;
   1244 
   1245     /* p4 */
   1246     q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
   1247     tmp0_r = p4_r_in - p5_r_in;
   1248     tmp0_r += q2_r_in;
   1249     tmp0_r -= p7_r_in;
   1250     tmp1_r += tmp0_r;
   1251     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1252     q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
   1253     tmp0_l = p4_l_in - p5_l_in;
   1254     tmp0_l += q2_l_in;
   1255     tmp0_l -= p7_l_in;
   1256     tmp1_l += tmp0_l;
   1257     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1258     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1259     p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
   1260     ST_UB(p4, src);
   1261     src += 16;
   1262 
   1263     /* p3 */
   1264     q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
   1265     tmp0_r = p3_r_in - p4_r_in;
   1266     tmp0_r += q3_r_in;
   1267     tmp0_r -= p7_r_in;
   1268     tmp1_r += tmp0_r;
   1269     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1270     q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
   1271     tmp0_l = p3_l_in - p4_l_in;
   1272     tmp0_l += q3_l_in;
   1273     tmp0_l -= p7_l_in;
   1274     tmp1_l += tmp0_l;
   1275     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1276     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1277     p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
   1278     ST_UB(p3, src);
   1279     src += 16;
   1280 
   1281     /* p2 */
   1282     q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
   1283     filter8 = LD_UB(filter48);
   1284     tmp0_r = p2_r_in - p3_r_in;
   1285     tmp0_r += q4_r_in;
   1286     tmp0_r -= p7_r_in;
   1287     tmp1_r += tmp0_r;
   1288     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1289     q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
   1290     tmp0_l = p2_l_in - p3_l_in;
   1291     tmp0_l += q4_l_in;
   1292     tmp0_l -= p7_l_in;
   1293     tmp1_l += tmp0_l;
   1294     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1295     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1296     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
   1297     ST_UB(filter8, src);
   1298     src += 16;
   1299 
   1300     /* p1 */
   1301     q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
   1302     filter8 = LD_UB(filter48 + 16);
   1303     tmp0_r = p1_r_in - p2_r_in;
   1304     tmp0_r += q5_r_in;
   1305     tmp0_r -= p7_r_in;
   1306     tmp1_r += tmp0_r;
   1307     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1308     q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
   1309     tmp0_l = p1_l_in - p2_l_in;
   1310     tmp0_l += q5_l_in;
   1311     tmp0_l -= p7_l_in;
   1312     tmp1_l += tmp0_l;
   1313     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1314     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1315     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
   1316     ST_UB(filter8, src);
   1317     src += 16;
   1318 
   1319     /* p0 */
   1320     q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
   1321     filter8 = LD_UB(filter48 + 32);
   1322     tmp0_r = p0_r_in - p1_r_in;
   1323     tmp0_r += q6_r_in;
   1324     tmp0_r -= p7_r_in;
   1325     tmp1_r += tmp0_r;
   1326     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1327     q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
   1328     tmp0_l = p0_l_in - p1_l_in;
   1329     tmp0_l += q6_l_in;
   1330     tmp0_l -= p7_l_in;
   1331     tmp1_l += tmp0_l;
   1332     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1333     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1334     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
   1335     ST_UB(filter8, src);
   1336     src += 16;
   1337 
   1338     /* q0 */
   1339     q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
   1340     filter8 = LD_UB(filter48 + 48);
   1341     tmp0_r = q7_r_in - p0_r_in;
   1342     tmp0_r += q0_r_in;
   1343     tmp0_r -= p7_r_in;
   1344     tmp1_r += tmp0_r;
   1345     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1346     q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
   1347     tmp0_l = q7_l_in - p0_l_in;
   1348     tmp0_l += q0_l_in;
   1349     tmp0_l -= p7_l_in;
   1350     tmp1_l += tmp0_l;
   1351     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1352     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1353     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
   1354     ST_UB(filter8, src);
   1355     src += 16;
   1356 
   1357     /* q1 */
   1358     filter8 = LD_UB(filter48 + 64);
   1359     tmp0_r = q7_r_in - q0_r_in;
   1360     tmp0_r += q1_r_in;
   1361     tmp0_r -= p6_r_in;
   1362     tmp1_r += tmp0_r;
   1363     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1364     tmp0_l = q7_l_in - q0_l_in;
   1365     tmp0_l += q1_l_in;
   1366     tmp0_l -= p6_l_in;
   1367     tmp1_l += tmp0_l;
   1368     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1369     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1370     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
   1371     ST_UB(filter8, src);
   1372     src += 16;
   1373 
   1374     /* q2 */
   1375     filter8 = LD_UB(filter48 + 80);
   1376     tmp0_r = q7_r_in - q1_r_in;
   1377     tmp0_r += q2_r_in;
   1378     tmp0_r -= p5_r_in;
   1379     tmp1_r += tmp0_r;
   1380     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1381     tmp0_l = q7_l_in - q1_l_in;
   1382     tmp0_l += q2_l_in;
   1383     tmp0_l -= p5_l_in;
   1384     tmp1_l += tmp0_l;
   1385     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1386     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1387     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
   1388     ST_UB(filter8, src);
   1389     src += 16;
   1390 
   1391     /* q3 */
   1392     tmp0_r = q7_r_in - q2_r_in;
   1393     tmp0_r += q3_r_in;
   1394     tmp0_r -= p4_r_in;
   1395     tmp1_r += tmp0_r;
   1396     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1397     tmp0_l = q7_l_in - q2_l_in;
   1398     tmp0_l += q3_l_in;
   1399     tmp0_l -= p4_l_in;
   1400     tmp1_l += tmp0_l;
   1401     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1402     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1403     q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
   1404     ST_UB(q3, src);
   1405     src += 16;
   1406 
   1407     /* q4 */
   1408     tmp0_r = q7_r_in - q3_r_in;
   1409     tmp0_r += q4_r_in;
   1410     tmp0_r -= p3_r_in;
   1411     tmp1_r += tmp0_r;
   1412     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1413     tmp0_l = q7_l_in - q3_l_in;
   1414     tmp0_l += q4_l_in;
   1415     tmp0_l -= p3_l_in;
   1416     tmp1_l += tmp0_l;
   1417     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1418     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1419     q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
   1420     ST_UB(q4, src);
   1421     src += 16;
   1422 
   1423     /* q5 */
   1424     tmp0_r = q7_r_in - q4_r_in;
   1425     tmp0_r += q5_r_in;
   1426     tmp0_r -= p2_r_in;
   1427     tmp1_r += tmp0_r;
   1428     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1429     tmp0_l = q7_l_in - q4_l_in;
   1430     tmp0_l += q5_l_in;
   1431     tmp0_l -= p2_l_in;
   1432     tmp1_l += tmp0_l;
   1433     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1434     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1435     q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
   1436     ST_UB(q5, src);
   1437     src += 16;
   1438 
   1439     /* q6 */
   1440     tmp0_r = q7_r_in - q5_r_in;
   1441     tmp0_r += q6_r_in;
   1442     tmp0_r -= p1_r_in;
   1443     tmp1_r += tmp0_r;
   1444     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1445     tmp0_l = q7_l_in - q5_l_in;
   1446     tmp0_l += q6_l_in;
   1447     tmp0_l -= p1_l_in;
   1448     tmp1_l += tmp0_l;
   1449     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1450     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1451     q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
   1452     ST_UB(q6, src);
   1453 
   1454     return 0;
   1455   }
   1456 }
   1457 
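        /* Dual vertical 16-wide filter: transpose a full 16x16 block, run the
         * 16-column horizontal kernels on it, and transpose back only when the
         * wide-filter path was taken. */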
   1458 void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
   1459                                   const uint8_t *b_limit_ptr,
   1460                                   const uint8_t *limit_ptr,
   1461                                   const uint8_t *thresh_ptr) {
   1462   uint8_t early_exit = 0;
   1463   DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
   1464   uint8_t *filter48 = &transposed_input[16 * 16];
   1465 
   1466   transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
   1467 
   1468   early_exit = vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
   1469                                         &filter48[0], src, pitch, b_limit_ptr,
   1470                                         limit_ptr, thresh_ptr);
   1471 
   1472   if (0 == early_exit) {
   1473     early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
   1474                                     &filter48[0]);
   1475 
   1476     if (0 == early_exit) {
   1477       transpose_16x16(transposed_input, 16, (src - 8), pitch);
   1478     }
   1479   }
   1480 }
   1481