      1 /*
      2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "vpx_ports/mem.h"
     12 #include "vpx_dsp/mips/loopfilter_msa.h"
     13 
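/* Stage 1 of the wide horizontal loop filter, covering 16 columns at once:
 * build the mask/hev/flat masks, apply the 4-tap filter, and, where any
 * column is flat, also compute the 8-tap filter.  Returns 1 (early exit)
 * when the flat mask is all zero, in which case only the 4-tap result has
 * been written back to the frame; otherwise the six filter8 output vectors
 * are stored to the scratch buffer filter48 (16 bytes apart, with the flat
 * mask at offset 96) and 0 is returned so stage 2 can run.
 */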
     14 int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48,
     15                                  const uint8_t *b_limit_ptr,
     16                                  const uint8_t *limit_ptr,
     17                                  const uint8_t *thresh_ptr) {
     18   v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
     19   v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
     20   v16u8 flat, mask, hev, thresh, b_limit, limit;
     21   v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
     22   v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
     23   v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
     24   v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
     25   v16u8 zero = { 0 };
     26 
     27   /* load vector elements */
     28   LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
     29 
     30   thresh = (v16u8)__msa_fill_b(*thresh_ptr);
     31   b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
     32   limit = (v16u8)__msa_fill_b(*limit_ptr);
     33 
     34   /* mask and hev */
     35   LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
     36                mask, flat);
     37   VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
     38   VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
     39 
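  /* flat is zero for every column: the 8- and 15-tap paths are not needed,
     so store the 4-tap output and exit early. */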
     40   if (__msa_test_bz_v(flat)) {
     41     ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
     42 
     43     return 1;
     44   } else {
     45     ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
     46                q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
     47     VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
     48                 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
     49 
     50     ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
     51     ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
     52     VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
     53                 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
     54 
     55     /* convert 16 bit output data into 8 bit */
     56     PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
     57                 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
     58                 p0_filt8_r, q0_filt8_r);
     59     PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
     60                 q2_filt8_r);
     61 
     62     /* store pixel values */
     63     p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
     64     p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
     65     p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
     66     q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
     67     q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
     68     q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
     69 
     70     ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
     71     filter48 += (4 * 16);
     72     ST_UB2(q1_out, q2_out, filter48, 16);
     73     filter48 += (2 * 16);
     74     ST_UB(flat, filter48);
     75 
     76     return 0;
     77   }
     78 }
     79 
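/* Stage 2: applies the 15-tap wide filter wherever flat2 is set; elsewhere
 * the inner pixels keep the stage-1 filter8 results read back from filter48
 * and the outer pixels are left unchanged.  In roughly equivalent scalar
 * form the first output is
 *
 *   p6' = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4)
 *
 * and each following output reuses the same running sum, adding the sample
 * entering the 15-tap window and subtracting the one leaving it.
 */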
     80 void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
     81   v16u8 flat, flat2, filter8;
     82   v16i8 zero = { 0 };
     83   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
     84   v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
     85   v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
     86   v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
     87   v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
     88   v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
     89   v8i16 l_out, r_out;
     90 
     91   flat = LD_UB(filter48 + 96);
     92 
     93   LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
     94   LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
     95   VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
     96 
     97   if (__msa_test_bz_v(flat2)) {
     98     LD_UB4(filter48, 16, p2, p1, p0, q0);
     99     LD_UB2(filter48 + 4 * 16, 16, q1, q2);
    100 
    101     src -= 3 * pitch;
    102     ST_UB4(p2, p1, p0, q0, src, pitch);
    103     src += (4 * pitch);
    104     ST_UB2(q1, q2, src, pitch);
    105   } else {
    106     src -= 7 * pitch;
    107 
    108     ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
    109                p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
    110                p2_r_in, p1_r_in, p0_r_in);
    111 
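    /* p6: running sum = p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0,
       rounded and shifted right by 4; later outputs update this sum with a
       sliding window instead of recomputing it. */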
    112     q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
    113 
    114     tmp0_r = p7_r_in << 3;
    115     tmp0_r -= p7_r_in;
    116     tmp0_r += p6_r_in;
    117     tmp0_r += q0_r_in;
    118     tmp1_r = p6_r_in + p5_r_in;
    119     tmp1_r += p4_r_in;
    120     tmp1_r += p3_r_in;
    121     tmp1_r += p2_r_in;
    122     tmp1_r += p1_r_in;
    123     tmp1_r += p0_r_in;
    124     tmp1_r += tmp0_r;
    125     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    126 
    127     ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
    128                p5_l_in, p4_l_in);
    129     ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
    130                p1_l_in, p0_l_in);
    131     q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
    132 
    133     tmp0_l = p7_l_in << 3;
    134     tmp0_l -= p7_l_in;
    135     tmp0_l += p6_l_in;
    136     tmp0_l += q0_l_in;
    137     tmp1_l = p6_l_in + p5_l_in;
    138     tmp1_l += p4_l_in;
    139     tmp1_l += p3_l_in;
    140     tmp1_l += p2_l_in;
    141     tmp1_l += p1_l_in;
    142     tmp1_l += p0_l_in;
    143     tmp1_l += tmp0_l;
    144     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    145 
    146     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    147     p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    148     ST_UB(p6, src);
    149     src += pitch;
    150 
    151     /* p5 */
    152     q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    153     tmp0_r = p5_r_in - p6_r_in;
    154     tmp0_r += q1_r_in;
    155     tmp0_r -= p7_r_in;
    156     tmp1_r += tmp0_r;
    157     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    158 
    159     q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
    160     tmp0_l = p5_l_in - p6_l_in;
    161     tmp0_l += q1_l_in;
    162     tmp0_l -= p7_l_in;
    163     tmp1_l += tmp0_l;
    164     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    165 
    166     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    167     p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    168     ST_UB(p5, src);
    169     src += pitch;
    170 
    171     /* p4 */
    172     q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    173     tmp0_r = p4_r_in - p5_r_in;
    174     tmp0_r += q2_r_in;
    175     tmp0_r -= p7_r_in;
    176     tmp1_r += tmp0_r;
    177     r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4);
    178 
    179     q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
    180     tmp0_l = p4_l_in - p5_l_in;
    181     tmp0_l += q2_l_in;
    182     tmp0_l -= p7_l_in;
    183     tmp1_l += tmp0_l;
    184     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    185 
    186     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    187     p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    188     ST_UB(p4, src);
    189     src += pitch;
    190 
    191     /* p3 */
    192     q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    193     tmp0_r = p3_r_in - p4_r_in;
    194     tmp0_r += q3_r_in;
    195     tmp0_r -= p7_r_in;
    196     tmp1_r += tmp0_r;
    197     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    198 
    199     q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
    200     tmp0_l = p3_l_in - p4_l_in;
    201     tmp0_l += q3_l_in;
    202     tmp0_l -= p7_l_in;
    203     tmp1_l += tmp0_l;
    204     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    205 
    206     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    207     p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    208     ST_UB(p3, src);
    209     src += pitch;
    210 
    211     /* p2 */
    212     q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    213     filter8 = LD_UB(filter48);
    214     tmp0_r = p2_r_in - p3_r_in;
    215     tmp0_r += q4_r_in;
    216     tmp0_r -= p7_r_in;
    217     tmp1_r += tmp0_r;
    218     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    219 
    220     q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
    221     tmp0_l = p2_l_in - p3_l_in;
    222     tmp0_l += q4_l_in;
    223     tmp0_l -= p7_l_in;
    224     tmp1_l += tmp0_l;
    225     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    226 
    227     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    228     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    229     ST_UB(filter8, src);
    230     src += pitch;
    231 
    232     /* p1 */
    233     q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    234     filter8 = LD_UB(filter48 + 16);
    235     tmp0_r = p1_r_in - p2_r_in;
    236     tmp0_r += q5_r_in;
    237     tmp0_r -= p7_r_in;
    238     tmp1_r += tmp0_r;
    239     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    240 
    241     q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
    242     tmp0_l = p1_l_in - p2_l_in;
    243     tmp0_l += q5_l_in;
    244     tmp0_l -= p7_l_in;
    245     tmp1_l += tmp0_l;
    246     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    247 
    248     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    249     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    250     ST_UB(filter8, src);
    251     src += pitch;
    252 
    253     /* p0 */
    254     q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    255     filter8 = LD_UB(filter48 + 32);
    256     tmp0_r = p0_r_in - p1_r_in;
    257     tmp0_r += q6_r_in;
    258     tmp0_r -= p7_r_in;
    259     tmp1_r += tmp0_r;
    260     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    261 
    262     q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
    263     tmp0_l = p0_l_in - p1_l_in;
    264     tmp0_l += q6_l_in;
    265     tmp0_l -= p7_l_in;
    266     tmp1_l += tmp0_l;
    267     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    268 
    269     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    270     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    271     ST_UB(filter8, src);
    272     src += pitch;
    273 
    274     /* q0 */
    275     q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    276     filter8 = LD_UB(filter48 + 48);
    277     tmp0_r = q7_r_in - p0_r_in;
    278     tmp0_r += q0_r_in;
    279     tmp0_r -= p7_r_in;
    280     tmp1_r += tmp0_r;
    281     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    282 
    283     q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
    284     tmp0_l = q7_l_in - p0_l_in;
    285     tmp0_l += q0_l_in;
    286     tmp0_l -= p7_l_in;
    287     tmp1_l += tmp0_l;
    288     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    289 
    290     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    291     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    292     ST_UB(filter8, src);
    293     src += pitch;
    294 
    295     /* q1 */
    296     filter8 = LD_UB(filter48 + 64);
    297     tmp0_r = q7_r_in - q0_r_in;
    298     tmp0_r += q1_r_in;
    299     tmp0_r -= p6_r_in;
    300     tmp1_r += tmp0_r;
    301     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    302 
    303     tmp0_l = q7_l_in - q0_l_in;
    304     tmp0_l += q1_l_in;
    305     tmp0_l -= p6_l_in;
    306     tmp1_l += tmp0_l;
    307     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    308 
    309     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    310     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    311     ST_UB(filter8, src);
    312     src += pitch;
    313 
    314     /* q2 */
    315     filter8 = LD_UB(filter48 + 80);
    316     tmp0_r = q7_r_in - q1_r_in;
    317     tmp0_r += q2_r_in;
    318     tmp0_r -= p5_r_in;
    319     tmp1_r += tmp0_r;
    320     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    321 
    322     tmp0_l = q7_l_in - q1_l_in;
    323     tmp0_l += q2_l_in;
    324     tmp0_l -= p5_l_in;
    325     tmp1_l += tmp0_l;
    326     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    327 
    328     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    329     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    330     ST_UB(filter8, src);
    331     src += pitch;
    332 
    333     /* q3 */
    334     tmp0_r = q7_r_in - q2_r_in;
    335     tmp0_r += q3_r_in;
    336     tmp0_r -= p4_r_in;
    337     tmp1_r += tmp0_r;
    338     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    339 
    340     tmp0_l = q7_l_in - q2_l_in;
    341     tmp0_l += q3_l_in;
    342     tmp0_l -= p4_l_in;
    343     tmp1_l += tmp0_l;
    344     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    345 
    346     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    347     q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    348     ST_UB(q3, src);
    349     src += pitch;
    350 
    351     /* q4 */
    352     tmp0_r = q7_r_in - q3_r_in;
    353     tmp0_r += q4_r_in;
    354     tmp0_r -= p3_r_in;
    355     tmp1_r += tmp0_r;
    356     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    357 
    358     tmp0_l = q7_l_in - q3_l_in;
    359     tmp0_l += q4_l_in;
    360     tmp0_l -= p3_l_in;
    361     tmp1_l += tmp0_l;
    362     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    363 
    364     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    365     q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    366     ST_UB(q4, src);
    367     src += pitch;
    368 
    369     /* q5 */
    370     tmp0_r = q7_r_in - q4_r_in;
    371     tmp0_r += q5_r_in;
    372     tmp0_r -= p2_r_in;
    373     tmp1_r += tmp0_r;
    374     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    375 
    376     tmp0_l = q7_l_in - q4_l_in;
    377     tmp0_l += q5_l_in;
    378     tmp0_l -= p2_l_in;
    379     tmp1_l += tmp0_l;
    380     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    381 
    382     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    383     q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    384     ST_UB(q5, src);
    385     src += pitch;
    386 
    387     /* q6 */
    388     tmp0_r = q7_r_in - q5_r_in;
    389     tmp0_r += q6_r_in;
    390     tmp0_r -= p1_r_in;
    391     tmp1_r += tmp0_r;
    392     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    393 
    394     tmp0_l = q7_l_in - q5_l_in;
    395     tmp0_l += q6_l_in;
    396     tmp0_l -= p1_l_in;
    397     tmp1_l += tmp0_l;
    398     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    399 
    400     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    401     q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    402     ST_UB(q6, src);
    403   }
    404 }
    405 
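/* Filters two adjacent 8-column horizontal edges (16 columns) by running
 * stage 1 and, unless it exits early, stage 2, with filter48 as the shared
 * scratch buffer between the two stages.
 */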
    406 static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch,
    407                                         const uint8_t *b_limit_ptr,
    408                                         const uint8_t *limit_ptr,
    409                                         const uint8_t *thresh_ptr,
    410                                         int32_t count) {
    411   DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
    412   uint8_t early_exit = 0;
    413 
    414   (void)count;
    415 
    416   early_exit = vpx_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
    417                                         limit_ptr, thresh_ptr);
    418 
    419   if (0 == early_exit) {
    420     vpx_hz_lpf_t16_16w(src, pitch, filter48);
    421   }
    422 }
    423 
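/* count == 1: filter a single 8-column edge entirely here, using 64-bit
 * stores for the 8 bytes that matter; any other count is handed to the
 * dual (16-column) path above.
 */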
    424 static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
    425                                    const uint8_t *b_limit_ptr,
    426                                    const uint8_t *limit_ptr,
    427                                    const uint8_t *thresh_ptr, int32_t count) {
    428   if (1 == count) {
    429     uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    430     uint64_t dword0, dword1;
    431     v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
    432     v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
    433     v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    434     v16u8 p0_filter16, p1_filter16;
    435     v8i16 p2_filter8, p1_filter8, p0_filter8;
    436     v8i16 q0_filter8, q1_filter8, q2_filter8;
    437     v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
    438     v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
    439     v16i8 zero = { 0 };
    440     v8u16 tmp0, tmp1, tmp2;
    441 
    442     /* load vector elements */
    443     LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
    444 
    445     thresh = (v16u8)__msa_fill_b(*thresh_ptr);
    446     b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
    447     limit = (v16u8)__msa_fill_b(*limit_ptr);
    448 
    449     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
    450                  mask, flat);
    451     VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    452     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
    453                        q1_out);
    454 
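    /* only 8 columns are filtered in this path, so clear the upper half of
       the flat mask before testing it */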
    455     flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
    456 
    457     if (__msa_test_bz_v(flat)) {
    458       p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
    459       p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
    460       q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
    461       q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    462       SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
    463     } else {
    464       /* convert 8 bit input data into 16 bit */
    465       ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
    466                  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
    467                  q3_r);
    468       VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
    469                   p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
    470 
    471       /* convert 16 bit output data into 8 bit */
    472       PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
    473                   q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
    474       PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
    475 
    476       /* store pixel values */
    477       p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
    478       p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
    479       p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
    480       q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
    481       q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
    482       q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);
    483 
     484       /* load the outer rows p7..p4 above and q4..q7 below the edge */
    485       LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
    486       LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
    487 
    488       VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
    489 
    490       if (__msa_test_bz_v(flat2)) {
    491         p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
    492         p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
    493         p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
    494         q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
    495         q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    496         q2_d = __msa_copy_u_d((v2i64)q2_out, 0);
    497 
    498         SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
    499         SD(q1_d, src + pitch);
    500         SD(q2_d, src + 2 * pitch);
    501       } else {
     502         /* operate on the low (LSB-side) 8 pixels of each vector */
    503         ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5,
    504                    zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r,
    505                    q7_r);
    506 
    507         tmp0 = p7_r << 3;
    508         tmp0 -= p7_r;
    509         tmp0 += p6_r;
    510         tmp0 += q0_r;
    511 
    512         src -= 7 * pitch;
    513 
    514         /* calculation of p6 and p5 */
    515         tmp1 = p6_r + p5_r + p4_r + p3_r;
    516         tmp1 += (p2_r + p1_r + p0_r);
    517         tmp1 += tmp0;
    518         p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    519         tmp0 = p5_r - p6_r + q1_r - p7_r;
    520         tmp1 += tmp0;
    521         p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    522         PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
    523                     p1_filter16);
    524         p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
    525         p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
    526         dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
    527         dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
    528         SD(dword0, src);
    529         src += pitch;
    530         SD(dword1, src);
    531         src += pitch;
    532 
    533         /* calculation of p4 and p3 */
    534         tmp0 = p4_r - p5_r + q2_r - p7_r;
    535         tmp2 = p3_r - p4_r + q3_r - p7_r;
    536         tmp1 += tmp0;
    537         p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    538         tmp1 += tmp2;
    539         p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    540         PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
    541                     p1_filter16);
    542         p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
    543         p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
    544         dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
    545         dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
    546         SD(dword0, src);
    547         src += pitch;
    548         SD(dword1, src);
    549         src += pitch;
    550 
    551         /* calculation of p2 and p1 */
    552         tmp0 = p2_r - p3_r + q4_r - p7_r;
    553         tmp2 = p1_r - p2_r + q5_r - p7_r;
    554         tmp1 += tmp0;
    555         p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    556         tmp1 += tmp2;
    557         p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    558         PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
    559                     p1_filter16);
    560         p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
    561         p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
    562         dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
    563         dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
    564         SD(dword0, src);
    565         src += pitch;
    566         SD(dword1, src);
    567         src += pitch;
    568 
    569         /* calculation of p0 and q0 */
    570         tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
    571         tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
    572         tmp1 += tmp0;
    573         p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    574         tmp1 += tmp2;
    575         p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    576         PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
    577                     p1_filter16);
    578         p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
    579         p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
    580         dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
    581         dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
    582         SD(dword0, src);
    583         src += pitch;
    584         SD(dword1, src);
    585         src += pitch;
    586 
    587         /* calculation of q1 and q2 */
    588         tmp0 = q7_r - q0_r + q1_r - p6_r;
    589         tmp2 = q7_r - q1_r + q2_r - p5_r;
    590         tmp1 += tmp0;
    591         p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    592         tmp1 += tmp2;
    593         p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    594         PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
    595                     p1_filter16);
    596         p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
    597         p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
    598         dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
    599         dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
    600         SD(dword0, src);
    601         src += pitch;
    602         SD(dword1, src);
    603         src += pitch;
    604 
    605         /* calculation of q3 and q4 */
    606         tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
    607         tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
    608         tmp1 += tmp0;
    609         p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    610         tmp1 += tmp2;
    611         p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    612         PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
    613                     p1_filter16);
    614         p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
    615         p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
    616         dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
    617         dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
    618         SD(dword0, src);
    619         src += pitch;
    620         SD(dword1, src);
    621         src += pitch;
    622 
    623         /* calculation of q5 and q6 */
    624         tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
    625         tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
    626         tmp1 += tmp0;
    627         p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    628         tmp1 += tmp2;
    629         p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
    630         PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
    631                     p1_filter16);
    632         p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
    633         p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
    634         dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
    635         dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
    636         SD(dword0, src);
    637         src += pitch;
    638         SD(dword1, src);
    639       }
    640     }
    641   } else {
    642     mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr,
    643                                 count);
    644   }
    645 }
    646 
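/* Public entry points: wide horizontal loop filter for one or two adjacent
 * 8-column edges.
 */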
    647 void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
    648                                const uint8_t *b_limit_ptr,
    649                                const uint8_t *limit_ptr,
    650                                const uint8_t *thresh_ptr) {
    651   mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1);
    652 }
    653 
    654 void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
    655                                     const uint8_t *b_limit_ptr,
    656                                     const uint8_t *limit_ptr,
    657                                     const uint8_t *thresh_ptr) {
    658   mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2);
    659 }
    660 
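/* The vertical filters below reuse the horizontal filter code by transposing
 * the pixels around the vertical edge into a scratch buffer with a fixed
 * pitch of 16, filtering there, and transposing back.  This routine turns a
 * 16x8 region of the frame into an 8x16 block in that scratch buffer.
 */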
    661 static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
    662                                    uint8_t *output, int32_t out_pitch) {
    663   v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
    664   v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    665   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
    666 
    667   LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org,
    668          p1_org, p0_org);
     669   /* 8x8 transpose of the left half (columns 0..7 -> p7..p0) */
    670   TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
    671                      p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
     672   /* 8x8 transpose of the right half (columns 8..15 -> q0..q7) */
    673   ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
    674              tmp0, tmp1, tmp2, tmp3);
    675   ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
    676   ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
    677   ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
    678   ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
    679   SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);
    680 
    681   ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
    682   output += (8 * out_pitch);
    683   ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
    684 }
    685 
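/* Inverse of the transpose above: writes the filtered 8x16 scratch block
 * back to the frame as a 16x8 region.
 */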
    686 static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
    687                                    uint8_t *output, int32_t out_pitch) {
    688   v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
    689   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
    690 
    691   LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
    692   LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
    693   TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
    694                       q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
    695   ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
    696 }
    697 
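/* Full 16x16 transpose, used by the dual vertical filter both to build the
 * scratch block and to write it back.
 */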
    698 static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
    699                             int32_t out_pitch) {
    700   v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    701   v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
    702   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
    703   v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
    704   v4i32 tmp2, tmp3;
    705 
    706   LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    707   input += (8 * in_pitch);
    708   LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15);
    709 
    710   TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
    711                       row9, row10, row11, row12, row13, row14, row15, p7, p6,
    712                       p5, p4, p3, p2, p1, p0);
    713 
     714   /* transpose the remaining 16x8 half into 8x16 (producing q0..q7) */
     715   /* uses 8 intermediate registers and 32 instructions in total */
    716   q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
    717   q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
    718   q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
    719   q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
    720   q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
    721   q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
    722   q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
    723   q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);
    724 
    725   ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
    726   tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
    727   tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);
    728 
    729   ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
    730   tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
    731   tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);
    732 
    733   ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
    734   q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
    735   q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
    736 
    737   tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0);
    738   tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5);
    739   q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
    740   q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
    741 
    742   ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
    743   q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
    744   q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
    745 
    746   tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4);
    747   tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6);
    748   q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
    749   q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
    750 
    751   ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
    752   output += (8 * out_pitch);
    753   ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
    754 }
    755 
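/* Vertical stage 1 for 8 rows.  src points into the transposed scratch
 * buffer (pitch 16); on early exit the 4-tap result is written straight back
 * to the frame through src_org/pitch_org with narrow transposed stores,
 * otherwise the filter8 results go to filter48 and 0 is returned.
 */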
    756 int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
    757                                 uint8_t *src_org, int32_t pitch_org,
    758                                 const uint8_t *b_limit_ptr,
    759                                 const uint8_t *limit_ptr,
    760                                 const uint8_t *thresh_ptr) {
    761   v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    762   v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    763   v16u8 flat, mask, hev, thresh, b_limit, limit;
    764   v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    765   v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
    766   v16i8 zero = { 0 };
    767   v8i16 vec0, vec1, vec2, vec3;
    768 
    769   /* load vector elements */
    770   LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
    771 
    772   thresh = (v16u8)__msa_fill_b(*thresh_ptr);
    773   b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
    774   limit = (v16u8)__msa_fill_b(*limit_ptr);
    775 
    776   /* mask and hev */
    777   LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
    778                mask, flat);
    779   /* flat4 */
    780   VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    781   /* filter4 */
    782   VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
    783 
    784   flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
    785 
    786   if (__msa_test_bz_v(flat)) {
    787     ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    788     ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    789     ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
    790     return 1;
    791   } else {
    792     ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
    793                q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    794     VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
    795                 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
    796 
    797     /* convert 16 bit output data into 8 bit */
    798     p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r);
    799     p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r);
    800     p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r);
    801     q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r);
    802     q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r);
    803     q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r);
    804 
    805     /* store pixel values */
    806     p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat);
    807     p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat);
    808     p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat);
    809     q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat);
    810     q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat);
    811     q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat);
    812 
    813     ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    814     filter48 += (4 * 16);
    815     ST_UB2(q1_out, q2_out, filter48, 16);
    816     filter48 += (2 * 16);
    817     ST_UB(flat, filter48);
    818 
    819     return 0;
    820   }
    821 }
    822 
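/* Vertical stage 2 for 8 rows.  If flat2 is all zero, the stage-1 filter8
 * output is written directly to the frame and 1 is returned (no final
 * transpose is needed); otherwise the 15-tap filter runs in the scratch
 * buffer and 0 is returned so the caller transposes it back.
 */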
    823 int32_t vpx_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
    824                           uint8_t *filter48) {
    825   v16i8 zero = { 0 };
    826   v16u8 filter8, flat, flat2;
    827   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
    828   v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
    829   v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
    830   v8u16 tmp0_r, tmp1_r;
    831   v8i16 r_out;
    832 
    833   flat = LD_UB(filter48 + 6 * 16);
    834 
    835   LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
    836   LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
    837 
    838   VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
    839 
    840   if (__msa_test_bz_v(flat2)) {
    841     v8i16 vec0, vec1, vec2, vec3, vec4;
    842 
    843     LD_UB4(filter48, 16, p2, p1, p0, q0);
    844     LD_UB2(filter48 + 4 * 16, 16, q1, q2);
    845 
    846     ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    847     ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    848     vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);
    849 
    850     src_org -= 3;
    851     ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
    852     ST2x4_UB(vec2, 0, (src_org + 4), pitch);
    853     src_org += (4 * pitch);
    854     ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
    855     ST2x4_UB(vec2, 4, (src_org + 4), pitch);
    856 
    857     return 1;
    858   } else {
    859     src -= 7 * 16;
    860 
    861     ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
    862                p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
    863                p2_r_in, p1_r_in, p0_r_in);
    864     q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
    865 
    866     tmp0_r = p7_r_in << 3;
    867     tmp0_r -= p7_r_in;
    868     tmp0_r += p6_r_in;
    869     tmp0_r += q0_r_in;
    870     tmp1_r = p6_r_in + p5_r_in;
    871     tmp1_r += p4_r_in;
    872     tmp1_r += p3_r_in;
    873     tmp1_r += p2_r_in;
    874     tmp1_r += p1_r_in;
    875     tmp1_r += p0_r_in;
    876     tmp1_r += tmp0_r;
    877 
    878     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    879     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    880     p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    881     ST8x1_UB(p6, src);
    882     src += 16;
    883 
    884     /* p5 */
    885     q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    886     tmp0_r = p5_r_in - p6_r_in;
    887     tmp0_r += q1_r_in;
    888     tmp0_r -= p7_r_in;
    889     tmp1_r += tmp0_r;
    890     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    891     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    892     p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    893     ST8x1_UB(p5, src);
    894     src += 16;
    895 
    896     /* p4 */
    897     q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    898     tmp0_r = p4_r_in - p5_r_in;
    899     tmp0_r += q2_r_in;
    900     tmp0_r -= p7_r_in;
    901     tmp1_r += tmp0_r;
    902     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    903     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    904     p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    905     ST8x1_UB(p4, src);
    906     src += 16;
    907 
    908     /* p3 */
    909     q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    910     tmp0_r = p3_r_in - p4_r_in;
    911     tmp0_r += q3_r_in;
    912     tmp0_r -= p7_r_in;
    913     tmp1_r += tmp0_r;
    914     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    915     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    916     p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    917     ST8x1_UB(p3, src);
    918     src += 16;
    919 
    920     /* p2 */
    921     q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    922     filter8 = LD_UB(filter48);
    923     tmp0_r = p2_r_in - p3_r_in;
    924     tmp0_r += q4_r_in;
    925     tmp0_r -= p7_r_in;
    926     tmp1_r += tmp0_r;
    927     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    928     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    929     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    930     ST8x1_UB(filter8, src);
    931     src += 16;
    932 
    933     /* p1 */
    934     q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    935     filter8 = LD_UB(filter48 + 16);
    936     tmp0_r = p1_r_in - p2_r_in;
    937     tmp0_r += q5_r_in;
    938     tmp0_r -= p7_r_in;
    939     tmp1_r += tmp0_r;
    940     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    941     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    942     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    943     ST8x1_UB(filter8, src);
    944     src += 16;
    945 
    946     /* p0 */
    947     q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    948     filter8 = LD_UB(filter48 + 32);
    949     tmp0_r = p0_r_in - p1_r_in;
    950     tmp0_r += q6_r_in;
    951     tmp0_r -= p7_r_in;
    952     tmp1_r += tmp0_r;
    953     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    954     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    955     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    956     ST8x1_UB(filter8, src);
    957     src += 16;
    958 
    959     /* q0 */
    960     q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    961     filter8 = LD_UB(filter48 + 48);
    962     tmp0_r = q7_r_in - p0_r_in;
    963     tmp0_r += q0_r_in;
    964     tmp0_r -= p7_r_in;
    965     tmp1_r += tmp0_r;
    966     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    967     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    968     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    969     ST8x1_UB(filter8, src);
    970     src += 16;
    971 
    972     /* q1 */
    973     filter8 = LD_UB(filter48 + 64);
    974     tmp0_r = q7_r_in - q0_r_in;
    975     tmp0_r += q1_r_in;
    976     tmp0_r -= p6_r_in;
    977     tmp1_r += tmp0_r;
    978     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    979     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    980     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    981     ST8x1_UB(filter8, src);
    982     src += 16;
    983 
    984     /* q2 */
    985     filter8 = LD_UB(filter48 + 80);
    986     tmp0_r = q7_r_in - q1_r_in;
    987     tmp0_r += q2_r_in;
    988     tmp0_r -= p5_r_in;
    989     tmp1_r += tmp0_r;
    990     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    991     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    992     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    993     ST8x1_UB(filter8, src);
    994     src += 16;
    995 
    996     /* q3 */
    997     tmp0_r = q7_r_in - q2_r_in;
    998     tmp0_r += q3_r_in;
    999     tmp0_r -= p4_r_in;
   1000     tmp1_r += tmp0_r;
   1001     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1002     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
   1003     q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
   1004     ST8x1_UB(q3, src);
   1005     src += 16;
   1006 
   1007     /* q4 */
   1008     tmp0_r = q7_r_in - q3_r_in;
   1009     tmp0_r += q4_r_in;
   1010     tmp0_r -= p3_r_in;
   1011     tmp1_r += tmp0_r;
   1012     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1013     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
   1014     q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
   1015     ST8x1_UB(q4, src);
   1016     src += 16;
   1017 
   1018     /* q5 */
   1019     tmp0_r = q7_r_in - q4_r_in;
   1020     tmp0_r += q5_r_in;
   1021     tmp0_r -= p2_r_in;
   1022     tmp1_r += tmp0_r;
   1023     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1024     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
   1025     q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
   1026     ST8x1_UB(q5, src);
   1027     src += 16;
   1028 
   1029     /* q6 */
   1030     tmp0_r = q7_r_in - q5_r_in;
   1031     tmp0_r += q6_r_in;
   1032     tmp0_r -= p1_r_in;
   1033     tmp1_r += tmp0_r;
   1034     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1035     r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
   1036     q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
   1037     ST8x1_UB(q6, src);
   1038 
   1039     return 0;
   1040   }
   1041 }
   1042 
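/* 8-row vertical wide filter: transpose a 16x8 region into the scratch
 * buffer, run both stages there, and transpose back only if the full path
 * actually modified the scratch buffer.
 */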
   1043 void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
   1044                              const uint8_t *b_limit_ptr,
   1045                              const uint8_t *limit_ptr,
   1046                              const uint8_t *thresh_ptr) {
   1047   uint8_t early_exit = 0;
   1048   DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
   1049   uint8_t *filter48 = &transposed_input[16 * 16];
   1050 
   1051   transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
   1052 
   1053   early_exit =
   1054       vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src,
   1055                               pitch, b_limit_ptr, limit_ptr, thresh_ptr);
   1056 
   1057   if (0 == early_exit) {
   1058     early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
   1059                                    &filter48[0]);
   1060 
   1061     if (0 == early_exit) {
   1062       transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
   1063     }
   1064   }
   1065 }
   1066 
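/* 16-row variant of the vertical stage-1 filter; on early exit it writes two
 * transposed 4x8 blocks (4 columns by 16 rows) back to the frame.
 */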
   1067 int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
   1068                                  uint8_t *src_org, int32_t pitch,
   1069                                  const uint8_t *b_limit_ptr,
   1070                                  const uint8_t *limit_ptr,
   1071                                  const uint8_t *thresh_ptr) {
   1072   v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
   1073   v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
   1074   v16u8 flat, mask, hev, thresh, b_limit, limit;
   1075   v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
   1076   v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
   1077   v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
   1078   v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
   1079   v16i8 zero = { 0 };
   1080   v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
   1081 
   1082   /* load vector elements */
   1083   LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
   1084 
   1085   thresh = (v16u8)__msa_fill_b(*thresh_ptr);
   1086   b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
   1087   limit = (v16u8)__msa_fill_b(*limit_ptr);
   1088 
   1089   /* mask and hev */
   1090   LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
   1091                mask, flat);
   1092   /* flat4 */
   1093   VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
   1094   /* filter4 */
   1095   VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
   1096 
   1097   if (__msa_test_bz_v(flat)) {
   1098     ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
   1099     ILVRL_H2_SH(vec1, vec0, vec2, vec3);
   1100     ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
   1101     ILVRL_H2_SH(vec1, vec0, vec4, vec5);
   1102 
   1103     src_org -= 2;
   1104     ST4x8_UB(vec2, vec3, src_org, pitch);
   1105     src_org += 8 * pitch;
   1106     ST4x8_UB(vec4, vec5, src_org, pitch);
   1107 
   1108     return 1;
   1109   } else {
   1110     ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
   1111                q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
   1112     VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
   1113                 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
   1114     ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
   1115     ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
   1116     VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
   1117                 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
   1118 
   1119     /* convert 16 bit output data into 8 bit */
   1120     PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
   1121                 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
   1122                 p0_filt8_r, q0_filt8_r);
   1123     PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
   1124                 q2_filt8_r);
   1125 
   1126     /* store pixel values */
   1127     p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
   1128     p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
   1129     p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
   1130     q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
   1131     q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
   1132     q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
   1133 
   1134     ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
   1135     filter48 += (4 * 16);
   1136     ST_UB2(q1_out, q2_out, filter48, 16);
   1137     filter48 += (2 * 16);
   1138     ST_UB(flat, filter48);
   1139 
   1140     return 0;
   1141   }
   1142 }
   1143 
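/* 16-row variant of the vertical stage-2 filter; the early-exit path writes
 * the stage-1 results back with 4x4 and 2x4 stores, the full path filters in
 * place in the scratch buffer.
 */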
   1144 int32_t vpx_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
   1145                            uint8_t *filter48) {
   1146   v16u8 flat, flat2, filter8;
   1147   v16i8 zero = { 0 };
   1148   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
   1149   v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
   1150   v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
   1151   v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
   1152   v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
   1153   v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
   1154   v8i16 l_out, r_out;
   1155 
   1156   flat = LD_UB(filter48 + 6 * 16);
   1157 
   1158   LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
   1159   LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
   1160 
   1161   VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
   1162 
   1163   if (__msa_test_bz_v(flat2)) {
   1164     v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
   1165 
   1166     LD_UB4(filter48, 16, p2, p1, p0, q0);
   1167     LD_UB2(filter48 + 4 * 16, 16, q1, q2);
   1168 
   1169     ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
   1170     ILVRL_H2_SH(vec1, vec0, vec3, vec4);
   1171     ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
   1172     ILVRL_H2_SH(vec1, vec0, vec6, vec7);
   1173     ILVRL_B2_SH(q2, q1, vec2, vec5);
   1174 
   1175     src_org -= 3;
   1176     ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
   1177     ST2x4_UB(vec2, 0, (src_org + 4), pitch);
   1178     src_org += (4 * pitch);
   1179     ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
   1180     ST2x4_UB(vec2, 4, (src_org + 4), pitch);
   1181     src_org += (4 * pitch);
   1182     ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
   1183     ST2x4_UB(vec5, 0, (src_org + 4), pitch);
   1184     src_org += (4 * pitch);
   1185     ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
   1186     ST2x4_UB(vec5, 4, (src_org + 4), pitch);
   1187 
   1188     return 1;
   1189   } else {
   1190     src -= 7 * 16;
   1191 
   1192     ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
   1193                p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
   1194                p2_r_in, p1_r_in, p0_r_in);
   1195     q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
   1196 
   1197     tmp0_r = p7_r_in << 3;
   1198     tmp0_r -= p7_r_in;
   1199     tmp0_r += p6_r_in;
   1200     tmp0_r += q0_r_in;
   1201     tmp1_r = p6_r_in + p5_r_in;
   1202     tmp1_r += p4_r_in;
   1203     tmp1_r += p3_r_in;
   1204     tmp1_r += p2_r_in;
   1205     tmp1_r += p1_r_in;
   1206     tmp1_r += p0_r_in;
   1207     tmp1_r += tmp0_r;
   1208     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1209 
   1210     ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
   1211                p5_l_in, p4_l_in);
   1212     ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
   1213                p1_l_in, p0_l_in);
   1214     q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
   1215 
   1216     tmp0_l = p7_l_in << 3;
   1217     tmp0_l -= p7_l_in;
   1218     tmp0_l += p6_l_in;
   1219     tmp0_l += q0_l_in;
   1220     tmp1_l = p6_l_in + p5_l_in;
   1221     tmp1_l += p4_l_in;
   1222     tmp1_l += p3_l_in;
   1223     tmp1_l += p2_l_in;
   1224     tmp1_l += p1_l_in;
   1225     tmp1_l += p0_l_in;
   1226     tmp1_l += tmp0_l;
   1227     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1228 
   1229     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1230     p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
   1231     ST_UB(p6, src);
   1232     src += 16;
   1233 
   1234     /* p5 */
   1235     q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
   1236     tmp0_r = p5_r_in - p6_r_in;
   1237     tmp0_r += q1_r_in;
   1238     tmp0_r -= p7_r_in;
   1239     tmp1_r += tmp0_r;
   1240     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1241     q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
   1242     tmp0_l = p5_l_in - p6_l_in;
   1243     tmp0_l += q1_l_in;
   1244     tmp0_l -= p7_l_in;
   1245     tmp1_l += tmp0_l;
   1246     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1247     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1248     p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
   1249     ST_UB(p5, src);
   1250     src += 16;
   1251 
   1252     /* p4 */
   1253     q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
   1254     tmp0_r = p4_r_in - p5_r_in;
   1255     tmp0_r += q2_r_in;
   1256     tmp0_r -= p7_r_in;
   1257     tmp1_r += tmp0_r;
   1258     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1259     q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
   1260     tmp0_l = p4_l_in - p5_l_in;
   1261     tmp0_l += q2_l_in;
   1262     tmp0_l -= p7_l_in;
   1263     tmp1_l += tmp0_l;
   1264     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1265     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1266     p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
   1267     ST_UB(p4, src);
   1268     src += 16;
   1269 
   1270     /* p3 */
   1271     q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
   1272     tmp0_r = p3_r_in - p4_r_in;
   1273     tmp0_r += q3_r_in;
   1274     tmp0_r -= p7_r_in;
   1275     tmp1_r += tmp0_r;
   1276     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1277     q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
   1278     tmp0_l = p3_l_in - p4_l_in;
   1279     tmp0_l += q3_l_in;
   1280     tmp0_l -= p7_l_in;
   1281     tmp1_l += tmp0_l;
   1282     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1283     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1284     p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
   1285     ST_UB(p3, src);
   1286     src += 16;
   1287 
   1288     /* p2 */
   1289     q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
   1290     filter8 = LD_UB(filter48);
   1291     tmp0_r = p2_r_in - p3_r_in;
   1292     tmp0_r += q4_r_in;
   1293     tmp0_r -= p7_r_in;
   1294     tmp1_r += tmp0_r;
   1295     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1296     q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
   1297     tmp0_l = p2_l_in - p3_l_in;
   1298     tmp0_l += q4_l_in;
   1299     tmp0_l -= p7_l_in;
   1300     tmp1_l += tmp0_l;
   1301     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1302     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1303     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
   1304     ST_UB(filter8, src);
   1305     src += 16;
   1306 
   1307     /* p1 */
   1308     q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
   1309     filter8 = LD_UB(filter48 + 16);
   1310     tmp0_r = p1_r_in - p2_r_in;
   1311     tmp0_r += q5_r_in;
   1312     tmp0_r -= p7_r_in;
   1313     tmp1_r += tmp0_r;
   1314     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1315     q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
   1316     tmp0_l = p1_l_in - p2_l_in;
   1317     tmp0_l += q5_l_in;
   1318     tmp0_l -= p7_l_in;
   1319     tmp1_l += tmp0_l;
   1320     l_out = __msa_srari_h((v8i16)(tmp1_l), 4);
   1321     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1322     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
   1323     ST_UB(filter8, src);
   1324     src += 16;
   1325 
   1326     /* p0 */
   1327     q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
   1328     filter8 = LD_UB(filter48 + 32);
   1329     tmp0_r = p0_r_in - p1_r_in;
   1330     tmp0_r += q6_r_in;
   1331     tmp0_r -= p7_r_in;
   1332     tmp1_r += tmp0_r;
   1333     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1334     q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
   1335     tmp0_l = p0_l_in - p1_l_in;
   1336     tmp0_l += q6_l_in;
   1337     tmp0_l -= p7_l_in;
   1338     tmp1_l += tmp0_l;
   1339     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1340     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1341     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
   1342     ST_UB(filter8, src);
   1343     src += 16;
   1344 
   1345     /* q0 */
   1346     q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
   1347     filter8 = LD_UB(filter48 + 48);
   1348     tmp0_r = q7_r_in - p0_r_in;
   1349     tmp0_r += q0_r_in;
   1350     tmp0_r -= p7_r_in;
   1351     tmp1_r += tmp0_r;
   1352     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1353     q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
   1354     tmp0_l = q7_l_in - p0_l_in;
   1355     tmp0_l += q0_l_in;
   1356     tmp0_l -= p7_l_in;
   1357     tmp1_l += tmp0_l;
   1358     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1359     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1360     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
   1361     ST_UB(filter8, src);
   1362     src += 16;
   1363 
   1364     /* q1 */
   1365     filter8 = LD_UB(filter48 + 64);
   1366     tmp0_r = q7_r_in - q0_r_in;
   1367     tmp0_r += q1_r_in;
   1368     tmp0_r -= p6_r_in;
   1369     tmp1_r += tmp0_r;
   1370     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1371     tmp0_l = q7_l_in - q0_l_in;
   1372     tmp0_l += q1_l_in;
   1373     tmp0_l -= p6_l_in;
   1374     tmp1_l += tmp0_l;
   1375     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1376     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1377     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
   1378     ST_UB(filter8, src);
   1379     src += 16;
   1380 
   1381     /* q2 */
   1382     filter8 = LD_UB(filter48 + 80);
   1383     tmp0_r = q7_r_in - q1_r_in;
   1384     tmp0_r += q2_r_in;
   1385     tmp0_r -= p5_r_in;
   1386     tmp1_r += tmp0_r;
   1387     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1388     tmp0_l = q7_l_in - q1_l_in;
   1389     tmp0_l += q2_l_in;
   1390     tmp0_l -= p5_l_in;
   1391     tmp1_l += tmp0_l;
   1392     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1393     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1394     filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
   1395     ST_UB(filter8, src);
   1396     src += 16;
   1397 
   1398     /* q3 */
   1399     tmp0_r = q7_r_in - q2_r_in;
   1400     tmp0_r += q3_r_in;
   1401     tmp0_r -= p4_r_in;
   1402     tmp1_r += tmp0_r;
   1403     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1404     tmp0_l = q7_l_in - q2_l_in;
   1405     tmp0_l += q3_l_in;
   1406     tmp0_l -= p4_l_in;
   1407     tmp1_l += tmp0_l;
   1408     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1409     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1410     q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
   1411     ST_UB(q3, src);
   1412     src += 16;
   1413 
   1414     /* q4 */
   1415     tmp0_r = q7_r_in - q3_r_in;
   1416     tmp0_r += q4_r_in;
   1417     tmp0_r -= p3_r_in;
   1418     tmp1_r += tmp0_r;
   1419     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1420     tmp0_l = q7_l_in - q3_l_in;
   1421     tmp0_l += q4_l_in;
   1422     tmp0_l -= p3_l_in;
   1423     tmp1_l += tmp0_l;
   1424     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1425     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1426     q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
   1427     ST_UB(q4, src);
   1428     src += 16;
   1429 
   1430     /* q5 */
   1431     tmp0_r = q7_r_in - q4_r_in;
   1432     tmp0_r += q5_r_in;
   1433     tmp0_r -= p2_r_in;
   1434     tmp1_r += tmp0_r;
   1435     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1436     tmp0_l = q7_l_in - q4_l_in;
   1437     tmp0_l += q5_l_in;
   1438     tmp0_l -= p2_l_in;
   1439     tmp1_l += tmp0_l;
   1440     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1441     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1442     q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
   1443     ST_UB(q5, src);
   1444     src += 16;
   1445 
   1446     /* q6 */
   1447     tmp0_r = q7_r_in - q5_r_in;
   1448     tmp0_r += q6_r_in;
   1449     tmp0_r -= p1_r_in;
   1450     tmp1_r += tmp0_r;
   1451     r_out = __msa_srari_h((v8i16)tmp1_r, 4);
   1452     tmp0_l = q7_l_in - q5_l_in;
   1453     tmp0_l += q6_l_in;
   1454     tmp0_l -= p1_l_in;
   1455     tmp1_l += tmp0_l;
   1456     l_out = __msa_srari_h((v8i16)tmp1_l, 4);
   1457     r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
   1458     q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
   1459     ST_UB(q6, src);
   1460 
   1461     return 0;
   1462   }
   1463 }
   1464 
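/* 16-row vertical wide filter: a full 16x16 transpose into the scratch
 * buffer, both filter stages, and a transpose back when the wide path ran.
 */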
   1465 void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
   1466                                   const uint8_t *b_limit_ptr,
   1467                                   const uint8_t *limit_ptr,
   1468                                   const uint8_t *thresh_ptr) {
   1469   uint8_t early_exit = 0;
   1470   DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
   1471   uint8_t *filter48 = &transposed_input[16 * 16];
   1472 
   1473   transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
   1474 
   1475   early_exit =
   1476       vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
   1477                                pitch, b_limit_ptr, limit_ptr, thresh_ptr);
   1478 
   1479   if (0 == early_exit) {
   1480     early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
   1481                                     &filter48[0]);
   1482 
   1483     if (0 == early_exit) {
   1484       transpose_16x16(transposed_input, 16, (src - 8), pitch);
   1485     }
   1486   }
   1487 }
   1488