Home | History | Annotate | Download | only in msa
      1 /*
      2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #ifndef VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
     12 #define VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
     13 
     14 #include <msa.h>
     15 
     16 #include "./vpx_config.h"
     17 #include "vpx/vpx_integer.h"
     18 
     19 #define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
     20 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
     21 #define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
     22 
     23 #define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc))
     24 #define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
     25 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
     26 
     27 #define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc))
     28 #define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
     29 #define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
     30 
     31 #define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
     32 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
     33 #define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
     34 
     35 #define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
     36 #define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
     37 #define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
     38 
     39 #define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
     40 #define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
     41 
     42 #if (__mips_isa_rev >= 6)
     43 #define LW(psrc)                                     \
     44   ({                                                 \
     45     const uint8_t *psrc_m = (const uint8_t *)(psrc); \
     46     uint32_t val_m;                                  \
     47                                                      \
     48     asm volatile("lw  %[val_m],  %[psrc_m]  \n\t"    \
     49                                                      \
     50                  : [val_m] "=r"(val_m)               \
     51                  : [psrc_m] "m"(*psrc_m));           \
     52                                                      \
     53     val_m;                                           \
     54   })
     55 
     56 #if (__mips == 64)
     57 #define LD(psrc)                                     \
     58   ({                                                 \
     59     const uint8_t *psrc_m = (const uint8_t *)(psrc); \
     60     uint64_t val_m = 0;                              \
     61                                                      \
     62     asm volatile("ld  %[val_m],  %[psrc_m]  \n\t"    \
     63                                                      \
     64                  : [val_m] "=r"(val_m)               \
     65                  : [psrc_m] "m"(*psrc_m));           \
     66                                                      \
     67     val_m;                                           \
     68   })
     69 #else  // !(__mips == 64)
     70 #define LD(psrc)                                            \
     71   ({                                                        \
     72     const uint8_t *psrc_m = (const uint8_t *)(psrc);        \
     73     uint32_t val0_m, val1_m;                                \
     74     uint64_t val_m = 0;                                     \
     75                                                             \
     76     val0_m = LW(psrc_m);                                    \
     77     val1_m = LW(psrc_m + 4);                                \
     78                                                             \
     79     val_m = (uint64_t)(val1_m);                             \
     80     val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
     81     val_m = (uint64_t)(val_m | (uint64_t)val0_m);           \
     82                                                             \
     83     val_m;                                                  \
     84   })
     85 #endif  // (__mips == 64)
     86 
     87 #define SH(val, pdst)                             \
     88   {                                               \
     89     uint8_t *pdst_m = (uint8_t *)(pdst);          \
     90     const uint16_t val_m = (val);                 \
     91                                                   \
     92     asm volatile("sh  %[val_m],  %[pdst_m]  \n\t" \
     93                                                   \
     94                  : [pdst_m] "=m"(*pdst_m)         \
     95                  : [val_m] "r"(val_m));           \
     96   }
     97 
     98 #define SW(val, pdst)                             \
     99   {                                               \
    100     uint8_t *pdst_m = (uint8_t *)(pdst);          \
    101     const uint32_t val_m = (val);                 \
    102                                                   \
    103     asm volatile("sw  %[val_m],  %[pdst_m]  \n\t" \
    104                                                   \
    105                  : [pdst_m] "=m"(*pdst_m)         \
    106                  : [val_m] "r"(val_m));           \
    107   }
    108 
    109 #define SD(val, pdst)                             \
    110   {                                               \
    111     uint8_t *pdst_m = (uint8_t *)(pdst);          \
    112     const uint64_t val_m = (val);                 \
    113                                                   \
    114     asm volatile("sd  %[val_m],  %[pdst_m]  \n\t" \
    115                                                   \
    116                  : [pdst_m] "=m"(*pdst_m)         \
    117                  : [val_m] "r"(val_m));           \
    118   }
    119 #else  // !(__mips_isa_rev >= 6)
    120 #define LW(psrc)                                     \
    121   ({                                                 \
    122     const uint8_t *psrc_m = (const uint8_t *)(psrc); \
    123     uint32_t val_m;                                  \
    124                                                      \
    125     asm volatile("ulw  %[val_m],  %[psrc_m]  \n\t"   \
    126                                                      \
    127                  : [val_m] "=r"(val_m)               \
    128                  : [psrc_m] "m"(*psrc_m));           \
    129                                                      \
    130     val_m;                                           \
    131   })
    132 
    133 #if (__mips == 64)
    134 #define LD(psrc)                                     \
    135   ({                                                 \
    136     const uint8_t *psrc_m = (const uint8_t *)(psrc); \
    137     uint64_t val_m = 0;                              \
    138                                                      \
    139     asm volatile("uld  %[val_m],  %[psrc_m]  \n\t"   \
    140                                                      \
    141                  : [val_m] "=r"(val_m)               \
    142                  : [psrc_m] "m"(*psrc_m));           \
    143                                                      \
    144     val_m;                                           \
    145   })
    146 #else  // !(__mips == 64)
    147 #define LD(psrc)                                            \
    148   ({                                                        \
    149     const uint8_t *psrc_m1 = (const uint8_t *)(psrc);       \
    150     uint32_t val0_m, val1_m;                                \
    151     uint64_t val_m = 0;                                     \
    152                                                             \
    153     val0_m = LW(psrc_m1);                                   \
    154     val1_m = LW(psrc_m1 + 4);                               \
    155                                                             \
    156     val_m = (uint64_t)(val1_m);                             \
    157     val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
    158     val_m = (uint64_t)(val_m | (uint64_t)val0_m);           \
    159                                                             \
    160     val_m;                                                  \
    161   })
    162 #endif  // (__mips == 64)
    163 #define SH(val, pdst)                              \
    164   {                                                \
    165     uint8_t *pdst_m = (uint8_t *)(pdst);           \
    166     const uint16_t val_m = (val);                  \
    167                                                    \
    168     asm volatile("ush  %[val_m],  %[pdst_m]  \n\t" \
    169                                                    \
    170                  : [pdst_m] "=m"(*pdst_m)          \
    171                  : [val_m] "r"(val_m));            \
    172   }
    173 
    174 #define SW(val, pdst)                              \
    175   {                                                \
    176     uint8_t *pdst_m = (uint8_t *)(pdst);           \
    177     const uint32_t val_m = (val);                  \
    178                                                    \
    179     asm volatile("usw  %[val_m],  %[pdst_m]  \n\t" \
    180                                                    \
    181                  : [pdst_m] "=m"(*pdst_m)          \
    182                  : [val_m] "r"(val_m));            \
    183   }
    184 
    185 #define SD(val, pdst)                                        \
    186   {                                                          \
    187     uint8_t *pdst_m1 = (uint8_t *)(pdst);                    \
    188     uint32_t val0_m, val1_m;                                 \
    189                                                              \
    190     val0_m = (uint32_t)((val)&0x00000000FFFFFFFF);           \
    191     val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
    192                                                              \
    193     SW(val0_m, pdst_m1);                                     \
    194     SW(val1_m, pdst_m1 + 4);                                 \
    195   }
    196 #endif  // (__mips_isa_rev >= 6)
    197 
    198 /* Description : Load 4 words with stride
    199    Arguments   : Inputs  - psrc, stride
    200                  Outputs - out0, out1, out2, out3
    201    Details     : Load word in 'out0' from (psrc)
    202                  Load word in 'out1' from (psrc + stride)
    203                  Load word in 'out2' from (psrc + 2 * stride)
    204                  Load word in 'out3' from (psrc + 3 * stride)
    205 */
    206 #define LW4(psrc, stride, out0, out1, out2, out3) \
    207   {                                               \
    208     out0 = LW((psrc));                            \
    209     out1 = LW((psrc) + stride);                   \
    210     out2 = LW((psrc) + 2 * stride);               \
    211     out3 = LW((psrc) + 3 * stride);               \
    212   }
    213 
    214 /* Description : Load double words with stride
    215    Arguments   : Inputs  - psrc, stride
    216                  Outputs - out0, out1
    217    Details     : Load double word in 'out0' from (psrc)
    218                  Load double word in 'out1' from (psrc + stride)
    219 */
    220 #define LD2(psrc, stride, out0, out1) \
    221   {                                   \
    222     out0 = LD((psrc));                \
    223     out1 = LD((psrc) + stride);       \
    224   }
    225 #define LD4(psrc, stride, out0, out1, out2, out3) \
    226   {                                               \
    227     LD2((psrc), stride, out0, out1);              \
    228     LD2((psrc) + 2 * stride, stride, out2, out3); \
    229   }
    230 
    231 /* Description : Store 4 words with stride
    232    Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
    233    Details     : Store word from 'in0' to (pdst)
    234                  Store word from 'in1' to (pdst + stride)
    235                  Store word from 'in2' to (pdst + 2 * stride)
    236                  Store word from 'in3' to (pdst + 3 * stride)
    237 */
    238 #define SW4(in0, in1, in2, in3, pdst, stride) \
    239   {                                           \
    240     SW(in0, (pdst));                          \
    241     SW(in1, (pdst) + stride);                 \
    242     SW(in2, (pdst) + 2 * stride);             \
    243     SW(in3, (pdst) + 3 * stride);             \
    244   }
    245 
    246 /* Description : Store 4 double words with stride
    247    Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
    248    Details     : Store double word from 'in0' to (pdst)
    249                  Store double word from 'in1' to (pdst + stride)
    250                  Store double word from 'in2' to (pdst + 2 * stride)
    251                  Store double word from 'in3' to (pdst + 3 * stride)
    252 */
    253 #define SD4(in0, in1, in2, in3, pdst, stride) \
    254   {                                           \
    255     SD(in0, (pdst));                          \
    256     SD(in1, (pdst) + stride);                 \
    257     SD(in2, (pdst) + 2 * stride);             \
    258     SD(in3, (pdst) + 3 * stride);             \
    259   }
    260 
    261 /* Description : Load vectors with 16 byte elements with stride
    262    Arguments   : Inputs  - psrc, stride
    263                  Outputs - out0, out1
    264                  Return Type - as per RTYPE
    265    Details     : Load 16 byte elements in 'out0' from (psrc)
    266                  Load 16 byte elements in 'out1' from (psrc + stride)
    267 */
    268 #define LD_B2(RTYPE, psrc, stride, out0, out1) \
    269   {                                            \
    270     out0 = LD_B(RTYPE, (psrc));                \
    271     out1 = LD_B(RTYPE, (psrc) + stride);       \
    272   }
    273 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
    274 #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
    275 
    276 #define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
    277   {                                                  \
    278     LD_B2(RTYPE, (psrc), stride, out0, out1);        \
    279     out2 = LD_B(RTYPE, (psrc) + 2 * stride);         \
    280   }
    281 #define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
    282 #define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)
    283 
    284 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
    285   {                                                        \
    286     LD_B2(RTYPE, (psrc), stride, out0, out1);              \
    287     LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
    288   }
    289 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
    290 #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
    291 
    292 #define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
    293   {                                                              \
    294     LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);        \
    295     out4 = LD_B(RTYPE, (psrc) + 4 * stride);                     \
    296   }
    297 #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
    298 #define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
    299 
    300 #define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
    301               out7)                                                          \
    302   {                                                                          \
    303     LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);                    \
    304     LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);       \
    305   }
    306 #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
    307 #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
    308 
    309 /* Description : Load vectors with 8 halfword elements with stride
    310    Arguments   : Inputs  - psrc, stride
    311                  Outputs - out0, out1
    312    Details     : Load 8 halfword elements in 'out0' from (psrc)
    313                  Load 8 halfword elements in 'out1' from (psrc + stride)
    314 */
    315 #define LD_H2(RTYPE, psrc, stride, out0, out1) \
    316   {                                            \
    317     out0 = LD_H(RTYPE, (psrc));                \
    318     out1 = LD_H(RTYPE, (psrc) + (stride));     \
    319   }
    320 #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
    321 
    322 #define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \
    323   {                                                        \
    324     LD_H2(RTYPE, (psrc), stride, out0, out1);              \
    325     LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
    326   }
    327 #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
    328 
    329 /* Description : Load 2 vectors of signed word elements with stride
    330    Arguments   : Inputs  - psrc, stride
    331                  Outputs - out0, out1
    332                  Return Type - signed word
    333 */
    334 #define LD_SW2(psrc, stride, out0, out1) \
    335   {                                      \
    336     out0 = LD_SW((psrc));                \
    337     out1 = LD_SW((psrc) + stride);       \
    338   }
    339 
    340 /* Description : Store vectors of 16 byte elements with stride
    341    Arguments   : Inputs - in0, in1, pdst, stride
    342    Details     : Store 16 byte elements from 'in0' to (pdst)
    343                  Store 16 byte elements from 'in1' to (pdst + stride)
    344 */
    345 #define ST_B2(RTYPE, in0, in1, pdst, stride) \
    346   {                                          \
    347     ST_B(RTYPE, in0, (pdst));                \
    348     ST_B(RTYPE, in1, (pdst) + stride);       \
    349   }
    350 #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
    351 
    352 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)   \
    353   {                                                      \
    354     ST_B2(RTYPE, in0, in1, (pdst), stride);              \
    355     ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
    356   }
    357 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
    358 #define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
    359 
    360 #define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
    361   {                                                                        \
    362     ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);                        \
    363     ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);         \
    364   }
    365 #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
    366 
    367 /* Description : Store vectors of 8 halfword elements with stride
    368    Arguments   : Inputs - in0, in1, pdst, stride
    369    Details     : Store 8 halfword elements from 'in0' to (pdst)
    370                  Store 8 halfword elements from 'in1' to (pdst + stride)
    371 */
    372 #define ST_H2(RTYPE, in0, in1, pdst, stride) \
    373   {                                          \
    374     ST_H(RTYPE, in0, (pdst));                \
    375     ST_H(RTYPE, in1, (pdst) + stride);       \
    376   }
    377 #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
    378 
    379 /* Description : Store vectors of word elements with stride
    380    Arguments   : Inputs - in0, in1, pdst, stride
    381    Details     : Store 4 word elements from 'in0' to (pdst)
    382                  Store 4 word elements from 'in1' to (pdst + stride)
    383 */
    384 #define ST_SW2(in0, in1, pdst, stride) \
    385   {                                    \
    386     ST_SW(in0, (pdst));                \
    387     ST_SW(in1, (pdst) + stride);       \
    388   }
    389 
    390 /* Description : Store 2x4 byte block to destination memory from input vector
    391    Arguments   : Inputs - in, stidx, pdst, stride
    392    Details     : Index 'stidx' halfword element from 'in' vector is copied to
    393                  the GP register and stored to (pdst)
    394                  Index 'stidx+1' halfword element from 'in' vector is copied to
    395                  the GP register and stored to (pdst + stride)
    396                  Index 'stidx+2' halfword element from 'in' vector is copied to
    397                  the GP register and stored to (pdst + 2 * stride)
    398                  Index 'stidx+3' halfword element from 'in' vector is copied to
    399                  the GP register and stored to (pdst + 3 * stride)
    400 */
    401 #define ST2x4_UB(in, stidx, pdst, stride)            \
    402   {                                                  \
    403     uint16_t out0_m, out1_m, out2_m, out3_m;         \
    404     uint8_t *pblk_2x4_m = (uint8_t *)(pdst);         \
    405                                                      \
    406     out0_m = __msa_copy_u_h((v8i16)in, (stidx));     \
    407     out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \
    408     out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \
    409     out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \
    410                                                      \
    411     SH(out0_m, pblk_2x4_m);                          \
    412     SH(out1_m, pblk_2x4_m + stride);                 \
    413     SH(out2_m, pblk_2x4_m + 2 * stride);             \
    414     SH(out3_m, pblk_2x4_m + 3 * stride);             \
    415   }
    416 
    417 /* Description : Store 4x4 byte block to destination memory from input vector
    418    Arguments   : Inputs - in0, in1, pdst, stride
    419    Details     : 'Idx0' word element from input vector 'in0' is copied to the
    420                  GP register and stored to (pdst)
    421                  'Idx1' word element from input vector 'in0' is copied to the
    422                  GP register and stored to (pdst + stride)
    423                  'Idx2' word element from input vector 'in0' is copied to the
    424                  GP register and stored to (pdst + 2 * stride)
    425                  'Idx3' word element from input vector 'in0' is copied to the
    426                  GP register and stored to (pdst + 3 * stride)
    427 */
    428 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
    429   {                                                              \
    430     uint32_t out0_m, out1_m, out2_m, out3_m;                     \
    431     uint8_t *pblk_4x4_m = (uint8_t *)(pdst);                     \
    432                                                                  \
    433     out0_m = __msa_copy_u_w((v4i32)in0, idx0);                   \
    434     out1_m = __msa_copy_u_w((v4i32)in0, idx1);                   \
    435     out2_m = __msa_copy_u_w((v4i32)in1, idx2);                   \
    436     out3_m = __msa_copy_u_w((v4i32)in1, idx3);                   \
    437                                                                  \
    438     SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);     \
    439   }
    440 #define ST4x8_UB(in0, in1, pdst, stride)                           \
    441   {                                                                \
    442     uint8_t *pblk_4x8 = (uint8_t *)(pdst);                         \
    443                                                                    \
    444     ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);              \
    445     ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
    446   }
    447 
    448 /* Description : Store 8x1 byte block to destination memory from input vector
    449    Arguments   : Inputs - in, pdst
    450    Details     : Index 0 double word element from 'in' vector is copied to the
    451                  GP register and stored to (pdst)
    452 */
    453 #define ST8x1_UB(in, pdst)                 \
    454   {                                        \
    455     uint64_t out0_m;                       \
    456                                            \
    457     out0_m = __msa_copy_u_d((v2i64)in, 0); \
    458     SD(out0_m, pdst);                      \
    459   }
    460 
    461 /* Description : Store 8x2 byte block to destination memory from input vector
    462    Arguments   : Inputs - in, pdst, stride
    463    Details     : Index 0 double word element from 'in' vector is copied to the
    464                  GP register and stored to (pdst)
    465                  Index 1 double word element from 'in' vector is copied to the
    466                  GP register and stored to (pdst + stride)
    467 */
    468 #define ST8x2_UB(in, pdst, stride)           \
    469   {                                          \
    470     uint64_t out0_m, out1_m;                 \
    471     uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
    472                                              \
    473     out0_m = __msa_copy_u_d((v2i64)in, 0);   \
    474     out1_m = __msa_copy_u_d((v2i64)in, 1);   \
    475                                              \
    476     SD(out0_m, pblk_8x2_m);                  \
    477     SD(out1_m, pblk_8x2_m + stride);         \
    478   }
    479 
    480 /* Description : Store 8x4 byte block to destination memory from input
    481                  vectors
    482    Arguments   : Inputs - in0, in1, pdst, stride
    483    Details     : Index 0 double word element from 'in0' vector is copied to the
    484                  GP register and stored to (pdst)
    485                  Index 1 double word element from 'in0' vector is copied to the
    486                  GP register and stored to (pdst + stride)
    487                  Index 0 double word element from 'in1' vector is copied to the
    488                  GP register and stored to (pdst + 2 * stride)
    489                  Index 1 double word element from 'in1' vector is copied to the
    490                  GP register and stored to (pdst + 3 * stride)
    491 */
    492 #define ST8x4_UB(in0, in1, pdst, stride)                     \
    493   {                                                          \
    494     uint64_t out0_m, out1_m, out2_m, out3_m;                 \
    495     uint8_t *pblk_8x4_m = (uint8_t *)(pdst);                 \
    496                                                              \
    497     out0_m = __msa_copy_u_d((v2i64)in0, 0);                  \
    498     out1_m = __msa_copy_u_d((v2i64)in0, 1);                  \
    499     out2_m = __msa_copy_u_d((v2i64)in1, 0);                  \
    500     out3_m = __msa_copy_u_d((v2i64)in1, 1);                  \
    501                                                              \
    502     SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
    503   }
    504 
    505 /* Description : Immediate number of elements to slide with zero
    506    Arguments   : Inputs  - in0, in1, slide_val
    507                  Outputs - out0, out1
    508                  Return Type - as per RTYPE
    509    Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
    510                  value specified in the 'slide_val'
    511 */
    512 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)             \
    513   {                                                                   \
    514     v16i8 zero_m = { 0 };                                             \
    515                                                                       \
    516     out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \
    517     out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \
    518   }
    519 #define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
    520 
    521 /* Description : Immediate number of elements to slide
    522    Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
    523                  Outputs - out0, out1
    524                  Return Type - as per RTYPE
    525    Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
    526                  value specified in the 'slide_val'
    527 */
    528 #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
    529   {                                                                       \
    530     out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val);    \
    531     out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val);    \
    532   }
    533 
    534 #define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \
    535                 out2, slide_val)                                             \
    536   {                                                                          \
    537     SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val);       \
    538     out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val);       \
    539   }
    540 #define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
    541 
    542 /* Description : Shuffle byte vector elements as per mask vector
    543    Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
    544                  Outputs - out0, out1
    545                  Return Type - as per RTYPE
    546    Details     : Byte elements from 'in0' & 'in1' are copied selectively to
    547                  'out0' as per control vector 'mask0'
    548 */
    549 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)  \
    550   {                                                                   \
    551     out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
    552     out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
    553   }
    554 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
    555 #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
    556 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
    557 
    558 #define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, \
    559                 out0, out1, out2)                                         \
    560   {                                                                       \
    561     VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);         \
    562     out2 = (RTYPE)__msa_vshf_b((v16i8)mask2, (v16i8)in5, (v16i8)in4);     \
    563   }
    564 #define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)
    565 
    566 /* Description : Shuffle halfword vector elements as per mask vector
    567    Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
    568                  Outputs - out0, out1
    569                  Return Type - as per RTYPE
    570    Details     : halfword elements from 'in0' & 'in1' are copied selectively to
    571                  'out0' as per control vector 'mask0'
    572 */
    573 #define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)  \
    574   {                                                                   \
    575     out0 = (RTYPE)__msa_vshf_h((v8i16)mask0, (v8i16)in1, (v8i16)in0); \
    576     out1 = (RTYPE)__msa_vshf_h((v8i16)mask1, (v8i16)in3, (v8i16)in2); \
    577   }
    578 #define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)
    579 
    580 /* Description : Dot product of byte vector elements
    581    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
    582                  Outputs - out0, out1
    583                  Return Type - as per RTYPE
    584    Details     : Unsigned byte elements from 'mult0' are multiplied with
    585                  unsigned byte elements from 'cnst0' producing a result
    586                  twice the size of input i.e. unsigned halfword.
    587                  The multiplication result of adjacent odd-even elements
    588                  are added together and written to the 'out0' vector
    589 */
    590 #define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
    591   {                                                             \
    592     out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0);   \
    593     out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1);   \
    594   }
    595 #define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
    596 
    597 #define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
    598                  cnst3, out0, out1, out2, out3)                          \
    599   {                                                                      \
    600     DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    601     DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
    602   }
    603 #define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
    604 
    605 /* Description : Dot product of byte vector elements
    606    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
    607                  Outputs - out0, out1
    608                  Return Type - as per RTYPE
    609    Details     : Signed byte elements from 'mult0' are multiplied with
    610                  signed byte elements from 'cnst0' producing a result
    611                  twice the size of input i.e. signed halfword.
    612                  The multiplication result of adjacent odd-even elements
    613                  are added together and written to the 'out0' vector
    614 */
    615 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
    616   {                                                             \
    617     out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0);   \
    618     out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1);   \
    619   }
    620 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
    621 
    622 #define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
    623                  cnst3, out0, out1, out2, out3)                          \
    624   {                                                                      \
    625     DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    626     DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
    627   }
    628 #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
    629 
    630 /* Description : Dot product of halfword vector elements
    631    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
    632                  Outputs - out0, out1
    633                  Return Type - as per RTYPE
    634    Details     : Signed halfword elements from 'mult0' are multiplied with
    635                  signed halfword elements from 'cnst0' producing a result
    636                  twice the size of input i.e. signed word.
    637                  The multiplication result of adjacent odd-even elements
    638                  are added together and written to the 'out0' vector
    639 */
    640 #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
    641   {                                                             \
    642     out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0);   \
    643     out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1);   \
    644   }
    645 
    646 #define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
    647                  cnst3, out0, out1, out2, out3)                          \
    648   {                                                                      \
    649     DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    650     DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
    651   }
    652 #define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
    653 
    654 /* Description : Dot product of word vector elements
    655    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
    656                  Outputs - out0, out1
    657                  Return Type - as per RTYPE
    658    Details     : Signed word elements from 'mult0' are multiplied with
    659                  signed word elements from 'cnst0' producing a result
    660                  twice the size of input i.e. signed double word.
    661                  The multiplication result of adjacent odd-even elements
    662                  are added together and written to the 'out0' vector
    663 */
    664 #define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
    665   {                                                             \
    666     out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0);   \
    667     out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1);   \
    668   }
    669 #define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
    670 
    671 /* Description : Dot product & addition of byte vector elements
    672    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
    673                  Outputs - out0, out1
    674                  Return Type - as per RTYPE
    675    Details     : Signed byte elements from 'mult0' are multiplied with
    676                  signed byte elements from 'cnst0' producing a result
    677                  twice the size of input i.e. signed halfword.
    678                  The multiplication result of adjacent odd-even elements
    679                  are added to the 'out0' vector
    680 */
    681 #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)            \
    682   {                                                                         \
    683     out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \
    684     out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \
    685   }
    686 #define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
    687 
    688 #define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
    689                   cnst3, out0, out1, out2, out3)                          \
    690   {                                                                       \
    691     DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    692     DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
    693   }
    694 #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
    695 
    696 /* Description : Dot product & addition of halfword vector elements
    697    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
    698                  Outputs - out0, out1
    699                  Return Type - as per RTYPE
    700    Details     : Signed halfword elements from 'mult0' are multiplied with
    701                  signed halfword elements from 'cnst0' producing a result
    702                  twice the size of input i.e. signed word.
    703                  The multiplication result of adjacent odd-even elements
    704                  are added to the 'out0' vector
    705 */
    706 #define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)            \
    707   {                                                                         \
    708     out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \
    709     out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \
    710   }
    711 #define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
    712 
    713 #define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
    714                   cnst3, out0, out1, out2, out3)                          \
    715   {                                                                       \
    716     DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    717     DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
    718   }
    719 #define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
    720 
    721 /* Description : Dot product & addition of double word vector elements
    722    Arguments   : Inputs  - mult0, mult1
    723                  Outputs - out0, out1
    724                  Return Type - as per RTYPE
    725    Details     : Each signed word element from 'mult0' is multiplied with itself
    726                  producing an intermediate result twice the size of it
    727                  i.e. signed double word
    728                  The multiplication result of adjacent odd-even elements
    729                  are added to the 'out0' vector
    730 */
    731 #define DPADD_SD2(RTYPE, mult0, mult1, out0, out1)                          \
    732   {                                                                         \
    733     out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \
    734     out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \
    735   }
    736 #define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
    737 
    738 /* Description : Clips all signed halfword elements of input vector
    739                  between 0 & 255
    740    Arguments   : Input  - in
    741                  Output - out_m
    742                  Return Type - signed halfword
    743 */
    744 #define CLIP_SH_0_255(in)                              \
    745   ({                                                   \
    746     v8i16 max_m = __msa_ldi_h(255);                    \
    747     v8i16 out_m;                                       \
    748                                                        \
    749     out_m = __msa_maxi_s_h((v8i16)in, 0);              \
    750     out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
    751     out_m;                                             \
    752   })
    753 #define CLIP_SH2_0_255(in0, in1) \
    754   {                              \
    755     in0 = CLIP_SH_0_255(in0);    \
    756     in1 = CLIP_SH_0_255(in1);    \
    757   }
    758 #define CLIP_SH4_0_255(in0, in1, in2, in3) \
    759   {                                        \
    760     CLIP_SH2_0_255(in0, in1);              \
    761     CLIP_SH2_0_255(in2, in3);              \
    762   }
    763 
    764 /* Description : Clips all signed word elements of input vector
    765                  between 0 & 255
    766    Arguments   : Input  - in
    767                  Output - out_m
    768                  Return Type - signed word
    769 */
    770 #define CLIP_SW_0_255(in)                              \
    771   ({                                                   \
    772     v4i32 max_m = __msa_ldi_w(255);                    \
    773     v4i32 out_m;                                       \
    774                                                        \
    775     out_m = __msa_maxi_s_w((v4i32)in, 0);              \
    776     out_m = __msa_min_s_w((v4i32)max_m, (v4i32)out_m); \
    777     out_m;                                             \
    778   })
    779 
    780 /* Description : Horizontal addition of 4 signed word elements of input vector
    781    Arguments   : Input  - in       (signed word vector)
    782                  Output - sum_m    (i32 sum)
    783                  Return Type - signed word (GP)
    784    Details     : 4 signed word elements of 'in' vector are added together and
    785                  the resulting integer sum is returned
    786 */
    787 #define HADD_SW_S32(in)                            \
    788   ({                                               \
    789     v2i64 res0_m, res1_m;                          \
    790     int32_t sum_m;                                 \
    791                                                    \
    792     res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \
    793     res1_m = __msa_splati_d(res0_m, 1);            \
    794     res0_m = res0_m + res1_m;                      \
    795     sum_m = __msa_copy_s_w((v4i32)res0_m, 0);      \
    796     sum_m;                                         \
    797   })
    798 
    799 /* Description : Horizontal addition of 8 unsigned halfword elements
    800    Arguments   : Inputs  - in       (unsigned halfword vector)
    801                  Outputs - sum_m    (u32 sum)
    802                  Return Type - unsigned word
    803    Details     : 8 unsigned halfword elements of input vector are added
    804                  together and the resulting integer sum is returned
    805 */
    806 #define HADD_UH_U32(in)                               \
    807   ({                                                  \
    808     v4u32 res_m;                                      \
    809     v2u64 res0_m, res1_m;                             \
    810     uint32_t sum_m;                                   \
    811                                                       \
    812     res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in);     \
    813     res0_m = __msa_hadd_u_d(res_m, res_m);            \
    814     res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
    815     res0_m = res0_m + res1_m;                         \
    816     sum_m = __msa_copy_u_w((v4i32)res0_m, 0);         \
    817     sum_m;                                            \
    818   })
    819 
    820 /* Description : Horizontal addition of unsigned byte vector elements
    821    Arguments   : Inputs  - in0, in1
    822                  Outputs - out0, out1
    823                  Return Type - as per RTYPE
    824    Details     : Each unsigned odd byte element from 'in0' is added to
    825                  even unsigned byte element from 'in0' (pairwise) and the
    826                  halfword result is written to 'out0'
    827 */
    828 #define HADD_UB2(RTYPE, in0, in1, out0, out1)             \
    829   {                                                       \
    830     out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \
    831     out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \
    832   }
    833 #define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
    834 
    835 /* Description : Horizontal subtraction of unsigned byte vector elements
    836    Arguments   : Inputs  - in0, in1
    837                  Outputs - out0, out1
    838                  Return Type - as per RTYPE
    839    Details     : Each unsigned odd byte element from 'in0' is subtracted from
    840                  even unsigned byte element from 'in0' (pairwise) and the
    841                  halfword result is written to 'out0'
    842 */
    843 #define HSUB_UB2(RTYPE, in0, in1, out0, out1)             \
    844   {                                                       \
    845     out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \
    846     out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \
    847   }
    848 #define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
    849 
    850 /* Description : Horizontal subtraction of signed halfword vector elements
    851    Arguments   : Inputs  - in0, in1
    852                  Outputs - out0, out1
    853                  Return Type - as per RTYPE
    854    Details     : Each signed odd halfword element from 'in0' is subtracted from
    855                  even signed halfword element from 'in0' (pairwise) and the
    856                  word result is written to 'out0'
    857 */
    858 #define HSUB_UH2(RTYPE, in0, in1, out0, out1)             \
    859   {                                                       \
    860     out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \
    861     out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \
    862   }
    863 #define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
    864 
    865 /* Description : Set element n input vector to GPR value
    866    Arguments   : Inputs - in0, in1, in2, in3
    867                  Output - out
    868                  Return Type - as per RTYPE
    869    Details     : Set element 0 in vector 'out' to value specified in 'in0'
    870 */
    871 #define INSERT_D2(RTYPE, in0, in1, out)              \
    872   {                                                  \
    873     out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
    874     out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
    875   }
    876 #define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
    877 
    878 /* Description : Interleave even byte elements from vectors
    879    Arguments   : Inputs  - in0, in1, in2, in3
    880                  Outputs - out0, out1
    881                  Return Type - as per RTYPE
    882    Details     : Even byte elements of 'in0' and 'in1' are interleaved
    883                  and written to 'out0'
    884 */
    885 #define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)  \
    886   {                                                      \
    887     out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
    888     out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
    889   }
    890 #define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
    891 #define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
    892 #define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
    893 
    894 /* Description : Interleave even halfword elements from vectors
    895    Arguments   : Inputs  - in0, in1, in2, in3
    896                  Outputs - out0, out1
    897                  Return Type - as per RTYPE
    898    Details     : Even halfword elements of 'in0' and 'in1' are interleaved
    899                  and written to 'out0'
    900 */
    901 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)  \
    902   {                                                      \
    903     out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
    904     out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
    905   }
    906 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
    907 #define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
    908 
    909 /* Description : Interleave even word elements from vectors
    910    Arguments   : Inputs  - in0, in1, in2, in3
    911                  Outputs - out0, out1
    912                  Return Type - as per RTYPE
    913    Details     : Even word elements of 'in0' and 'in1' are interleaved
    914                  and written to 'out0'
    915 */
    916 #define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)  \
    917   {                                                      \
    918     out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
    919     out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \
    920   }
    921 #define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
    922 
    923 /* Description : Interleave even double word elements from vectors
    924    Arguments   : Inputs  - in0, in1, in2, in3
    925                  Outputs - out0, out1
    926                  Return Type - as per RTYPE
    927    Details     : Even double word elements of 'in0' and 'in1' are interleaved
    928                  and written to 'out0'
    929 */
    930 #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)  \
    931   {                                                      \
    932     out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \
    933     out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \
    934   }
    935 #define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
    936 
    937 /* Description : Interleave left half of byte elements from vectors
    938    Arguments   : Inputs  - in0, in1, in2, in3
    939                  Outputs - out0, out1
    940                  Return Type - as per RTYPE
    941    Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
    942                  and written to 'out0'.
    943 */
    944 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)  \
    945   {                                                     \
    946     out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
    947     out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
    948   }
    949 #define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
    950 #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
    951 #define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
    952 
    953 #define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
    954                 out2, out3)                                                \
    955   {                                                                        \
    956     ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    957     ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
    958   }
    959 #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
    960 #define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
    961 
    962 /* Description : Interleave left half of halfword elements from vectors
    963    Arguments   : Inputs  - in0, in1, in2, in3
    964                  Outputs - out0, out1
    965                  Return Type - as per RTYPE
    966    Details     : Left half of halfword elements of 'in0' and 'in1' are
    967                  interleaved and written to 'out0'.
    968 */
    969 #define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)  \
    970   {                                                     \
    971     out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
    972     out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
    973   }
    974 #define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
    975 #define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
    976 
    977 /* Description : Interleave left half of word elements from vectors
    978    Arguments   : Inputs  - in0, in1, in2, in3
    979                  Outputs - out0, out1
    980                  Return Type - as per RTYPE
    981    Details     : Left half of word elements of 'in0' and 'in1' are interleaved
    982                  and written to 'out0'.
    983 */
    984 #define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)  \
    985   {                                                     \
    986     out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
    987     out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \
    988   }
    989 #define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
    990 
    991 /* Description : Interleave right half of byte elements from vectors
    992    Arguments   : Inputs  - in0, in1, in2, in3
    993                  Outputs - out0, out1
    994                  Return Type - as per RTYPE
    995    Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
    996                  and written to out0.
    997 */
    998 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)  \
    999   {                                                     \
   1000     out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
   1001     out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
   1002   }
   1003 #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
   1004 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
   1005 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
   1006 #define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
   1007 
   1008 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
   1009                 out2, out3)                                                \
   1010   {                                                                        \
   1011     ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
   1012     ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
   1013   }
   1014 #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
   1015 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
   1016 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
   1017 #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
   1018 #define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)
   1019 
   1020 /* Description : Interleave right half of halfword elements from vectors
   1021    Arguments   : Inputs  - in0, in1, in2, in3
   1022                  Outputs - out0, out1
   1023                  Return Type - as per RTYPE
   1024    Details     : Right half of halfword elements of 'in0' and 'in1' are
   1025                  interleaved and written to 'out0'.
   1026 */
   1027 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)  \
   1028   {                                                     \
   1029     out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
   1030     out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
   1031   }
   1032 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
   1033 #define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
   1034 
   1035 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
   1036                 out2, out3)                                                \
   1037   {                                                                        \
   1038     ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
   1039     ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
   1040   }
   1041 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
   1042 #define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
   1043 
   1044 #define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)  \
   1045   {                                                     \
   1046     out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
   1047     out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \
   1048   }
   1049 #define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
   1050 
   1051 /* Description : Interleave right half of double word elements from vectors
   1052    Arguments   : Inputs  - in0, in1, in2, in3
   1053                  Outputs - out0, out1
   1054                  Return Type - as per RTYPE
   1055    Details     : Right half of double word elements of 'in0' and 'in1' are
   1056                  interleaved and written to 'out0'.
   1057 */
   1058 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
   1059   {                                                         \
   1060     out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
   1061     out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
   1062   }
   1063 #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
   1064 #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
   1065 #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
   1066 
   1067 #define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
   1068                 out2, out3)                                                \
   1069   {                                                                        \
   1070     ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
   1071     ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
   1072   }
   1073 #define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
   1074 #define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
   1075 
   1076 /* Description : Interleave both left and right half of input vectors
   1077    Arguments   : Inputs  - in0, in1
   1078                  Outputs - out0, out1
   1079                  Return Type - as per RTYPE
   1080    Details     : Right half of byte elements from 'in0' and 'in1' are
   1081                  interleaved and written to 'out0'
   1082 */
   1083 #define ILVRL_B2(RTYPE, in0, in1, out0, out1)           \
   1084   {                                                     \
   1085     out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
   1086     out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
   1087   }
   1088 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
   1089 #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
   1090 #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
   1091 #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
   1092 
   1093 #define ILVRL_H2(RTYPE, in0, in1, out0, out1)           \
   1094   {                                                     \
   1095     out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
   1096     out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
   1097   }
   1098 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
   1099 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
   1100 
   1101 #define ILVRL_W2(RTYPE, in0, in1, out0, out1)           \
   1102   {                                                     \
   1103     out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
   1104     out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
   1105   }
   1106 #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
   1107 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
   1108 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
   1109 
   1110 /* Description : Maximum values between signed elements of vector and
   1111                  5-bit signed immediate value are copied to the output vector
   1112    Arguments   : Inputs  - in0, in1, in2, in3, max_val
   1113                  Outputs - in place operation
   1114                  Return Type - unsigned halfword
   1115    Details     : Maximum of signed halfword element values from 'in0' and
   1116                  'max_val' are written in place
   1117 */
   1118 #define MAXI_SH2(RTYPE, in0, in1, max_val)              \
   1119   {                                                     \
   1120     in0 = (RTYPE)__msa_maxi_s_h((v8i16)in0, (max_val)); \
   1121     in1 = (RTYPE)__msa_maxi_s_h((v8i16)in1, (max_val)); \
   1122   }
   1123 #define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
   1124 
   1125 /* Description : Saturate the halfword element values to the max
   1126                  unsigned value of (sat_val + 1) bits
   1127                  The element data width remains unchanged
   1128    Arguments   : Inputs  - in0, in1, sat_val
   1129                  Outputs - in place operation
   1130                  Return Type - as per RTYPE
   1131    Details     : Each unsigned halfword element from 'in0' is saturated to the
   1132                  value generated with (sat_val + 1) bit range.
   1133                  The results are written in place
   1134 */
   1135 #define SAT_UH2(RTYPE, in0, in1, sat_val)            \
   1136   {                                                  \
   1137     in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
   1138     in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \
   1139   }
   1140 #define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)
   1141 
   1142 /* Description : Saturate the halfword element values to the max
   1143                  unsigned value of (sat_val + 1) bits
   1144                  The element data width remains unchanged
   1145    Arguments   : Inputs  - in0, in1, sat_val
   1146                  Outputs - in place operation
   1147                  Return Type - as per RTYPE
   1148    Details     : Each unsigned halfword element from 'in0' is saturated to the
   1149                  value generated with (sat_val + 1) bit range
   1150                  The results are written in place
   1151 */
   1152 #define SAT_SH2(RTYPE, in0, in1, sat_val)            \
   1153   {                                                  \
   1154     in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
   1155     in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \
   1156   }
   1157 #define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
   1158 
   1159 #define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
   1160   {                                                 \
   1161     SAT_SH2(RTYPE, in0, in1, sat_val);              \
   1162     SAT_SH2(RTYPE, in2, in3, sat_val);              \
   1163   }
   1164 #define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
   1165 
   1166 /* Description : Indexed halfword element values are replicated to all
   1167                  elements in output vector
   1168    Arguments   : Inputs  - in, idx0, idx1
   1169                  Outputs - out0, out1
   1170                  Return Type - as per RTYPE
   1171    Details     : 'idx0' element value from 'in' vector is replicated to all
   1172                   elements in 'out0' vector
   1173                   Valid index range for halfword operation is 0-7
   1174 */
   1175 #define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
   1176   {                                                  \
   1177     out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0);   \
   1178     out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1);   \
   1179   }
   1180 #define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
   1181 #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
   1182 
   1183 #define SPLATI_H3(RTYPE, in, idx0, idx1, idx2, out0, out1, out2) \
   1184   {                                                              \
   1185     SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);                \
   1186     out2 = (RTYPE)__msa_splati_h((v8i16)in, idx2);               \
   1187   }
   1188 #define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
   1189 #define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)
   1190 
   1191 /* Description : Indexed word element values are replicated to all
   1192                  elements in output vector
   1193    Arguments   : Inputs  - in, stidx
   1194                  Outputs - out0, out1
   1195                  Return Type - as per RTYPE
   1196    Details     : 'stidx' element value from 'in' vector is replicated to all
   1197                  elements in 'out0' vector
   1198                  'stidx + 1' element value from 'in' vector is replicated to all
   1199                  elements in 'out1' vector
   1200                  Valid index range for word operation is 0-3
   1201 */
   1202 #define SPLATI_W2(RTYPE, in, stidx, out0, out1)           \
   1203   {                                                       \
   1204     out0 = (RTYPE)__msa_splati_w((v4i32)in, stidx);       \
   1205     out1 = (RTYPE)__msa_splati_w((v4i32)in, (stidx + 1)); \
   1206   }
   1207 #define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)
   1208 
   1209 /* Description : Pack even byte elements of vector pairs
   1210    Arguments   : Inputs  - in0, in1, in2, in3
   1211                  Outputs - out0, out1
   1212                  Return Type - as per RTYPE
   1213    Details     : Even byte elements of 'in0' are copied to the left half of
   1214                  'out0' & even byte elements of 'in1' are copied to the right
   1215                  half of 'out0'.
   1216 */
   1217 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)  \
   1218   {                                                      \
   1219     out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
   1220     out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \
   1221   }
   1222 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
   1223 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
   1224 #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
   1225 #define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
   1226 
   1227 #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
   1228                  out2, out3)                                                \
   1229   {                                                                         \
   1230     PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
   1231     PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
   1232   }
   1233 #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
   1234 #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
   1235 #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
   1236 
   1237 /* Description : Pack even halfword elements of vector pairs
   1238    Arguments   : Inputs  - in0, in1, in2, in3
   1239                  Outputs - out0, out1
   1240                  Return Type - as per RTYPE
   1241    Details     : Even halfword elements of 'in0' are copied to the left half of
   1242                  'out0' & even halfword elements of 'in1' are copied to the
   1243                  right half of 'out0'.
   1244 */
   1245 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)  \
   1246   {                                                      \
   1247     out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
   1248     out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
   1249   }
   1250 #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
   1251 
   1252 #define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
   1253                  out2, out3)                                                \
   1254   {                                                                         \
   1255     PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
   1256     PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
   1257   }
   1258 #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
   1259 
   1260 /* Description : Pack even double word elements of vector pairs
   1261    Arguments   : Inputs  - in0, in1, in2, in3
   1262                  Outputs - out0, out1
   1263                  Return Type - as per RTYPE
   1264    Details     : Even double elements of 'in0' are copied to the left half of
   1265                  'out0' & even double elements of 'in1' are copied to the right
   1266                  half of 'out0'.
   1267 */
   1268 #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)  \
   1269   {                                                      \
   1270     out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \
   1271     out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \
   1272   }
   1273 #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
   1274 #define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
   1275 
   1276 /* Description : Pack odd double word elements of vector pairs
   1277    Arguments   : Inputs  - in0, in1, in2, in3
   1278                  Outputs - out0, out1
   1279                  Return Type - as per RTYPE
   1280    Details     : Odd double word elements of 'in0' are copied to the left half
   1281                  of 'out0' & odd double word elements of 'in1' are copied to
   1282                  the right half of 'out0'.
   1283 */
   1284 #define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1)  \
   1285   {                                                      \
   1286     out0 = (RTYPE)__msa_pckod_d((v2i64)in0, (v2i64)in1); \
   1287     out1 = (RTYPE)__msa_pckod_d((v2i64)in2, (v2i64)in3); \
   1288   }
   1289 #define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
   1290 #define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
   1291 
   1292 /* Description : Each byte element is logically xor'ed with immediate 128
   1293    Arguments   : Inputs  - in0, in1
   1294                  Outputs - in place operation
   1295                  Return Type - as per RTYPE
   1296    Details     : Each unsigned byte element from input vector 'in0' is
   1297                  logically xor'ed with 128 and the result is stored in-place.
   1298 */
   1299 #define XORI_B2_128(RTYPE, in0, in1)            \
   1300   {                                             \
   1301     in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
   1302     in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
   1303   }
   1304 #define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
   1305 #define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
   1306 
   1307 #define XORI_B3_128(RTYPE, in0, in1, in2)       \
   1308   {                                             \
   1309     XORI_B2_128(RTYPE, in0, in1);               \
   1310     in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \
   1311   }
   1312 #define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
   1313 
   1314 #define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
   1315   {                                            \
   1316     XORI_B2_128(RTYPE, in0, in1);              \
   1317     XORI_B2_128(RTYPE, in2, in3);              \
   1318   }
   1319 #define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
   1320 #define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
   1321 
   1322 #define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4) \
   1323   {                                                 \
   1324     XORI_B3_128(RTYPE, in0, in1, in2);              \
   1325     XORI_B2_128(RTYPE, in3, in4);                   \
   1326   }
   1327 #define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)
   1328 
   1329 #define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7) \
   1330   {                                                                \
   1331     XORI_B4_128(RTYPE, in0, in1, in2, in3);                        \
   1332     XORI_B4_128(RTYPE, in4, in5, in6, in7);                        \
   1333   }
   1334 #define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
   1335 
   1336 /* Description : Shift left all elements of vector (generic for all data types)
   1337    Arguments   : Inputs  - in0, in1, in2, in3, shift
   1338                  Outputs - in place operation
   1339                  Return Type - as per input vector RTYPE
   1340    Details     : Each element of vector 'in0' is left shifted by 'shift' and
   1341                  the result is written in-place.
   1342 */
   1343 #define SLLI_4V(in0, in1, in2, in3, shift) \
   1344   {                                        \
   1345     in0 = in0 << shift;                    \
   1346     in1 = in1 << shift;                    \
   1347     in2 = in2 << shift;                    \
   1348     in3 = in3 << shift;                    \
   1349   }
   1350 
   1351 /* Description : Arithmetic shift right all elements of vector
   1352                  (generic for all data types)
   1353    Arguments   : Inputs  - in0, in1, in2, in3, shift
   1354                  Outputs - in place operation
   1355                  Return Type - as per input vector RTYPE
   1356    Details     : Each element of vector 'in0' is right shifted by 'shift' and
   1357                  the result is written in-place. 'shift' is a GP variable.
   1358 */
   1359 #define SRA_4V(in0, in1, in2, in3, shift) \
   1360   {                                       \
   1361     in0 = in0 >> shift;                   \
   1362     in1 = in1 >> shift;                   \
   1363     in2 = in2 >> shift;                   \
   1364     in3 = in3 >> shift;                   \
   1365   }
   1366 
   1367 /* Description : Shift right arithmetic rounded words
   1368    Arguments   : Inputs  - in0, in1, shift
   1369                  Outputs - in place operation
   1370                  Return Type - as per RTYPE
   1371    Details     : Each element of vector 'in0' is shifted right arithmetically by
   1372                  the number of bits in the corresponding element in the vector
   1373                  'shift'. The last discarded bit is added to shifted value for
   1374                  rounding and the result is written in-place.
   1375                  'shift' is a vector.
   1376 */
   1377 #define SRAR_W2(RTYPE, in0, in1, shift)                  \
   1378   {                                                      \
   1379     in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \
   1380     in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \
   1381   }
   1382 
   1383 #define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
   1384   {                                               \
   1385     SRAR_W2(RTYPE, in0, in1, shift);              \
   1386     SRAR_W2(RTYPE, in2, in3, shift);              \
   1387   }
   1388 #define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
   1389 
   1390 /* Description : Shift right arithmetic rounded (immediate)
   1391    Arguments   : Inputs  - in0, in1, shift
   1392                  Outputs - in place operation
   1393                  Return Type - as per RTYPE
   1394    Details     : Each element of vector 'in0' is shifted right arithmetically by
   1395                  the value in 'shift'. The last discarded bit is added to the
   1396                  shifted value for rounding and the result is written in-place.
   1397                  'shift' is an immediate value.
   1398 */
   1399 #define SRARI_H2(RTYPE, in0, in1, shift)           \
   1400   {                                                \
   1401     in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \
   1402     in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \
   1403   }
   1404 #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
   1405 #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
   1406 
   1407 #define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
   1408   {                                                \
   1409     SRARI_H2(RTYPE, in0, in1, shift);              \
   1410     SRARI_H2(RTYPE, in2, in3, shift);              \
   1411   }
   1412 #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
   1413 #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
   1414 
   1415 #define SRARI_W2(RTYPE, in0, in1, shift)           \
   1416   {                                                \
   1417     in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
   1418     in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
   1419   }
   1420 
   1421 #define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
   1422   {                                                \
   1423     SRARI_W2(RTYPE, in0, in1, shift);              \
   1424     SRARI_W2(RTYPE, in2, in3, shift);              \
   1425   }
   1426 #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
   1427 
   1428 /* Description : Multiplication of pairs of vectors
   1429    Arguments   : Inputs  - in0, in1, in2, in3
   1430                  Outputs - out0, out1
   1431    Details     : Each element from 'in0' is multiplied with elements from 'in1'
   1432                  and the result is written to 'out0'
   1433 */
   1434 #define MUL2(in0, in1, in2, in3, out0, out1) \
   1435   {                                          \
   1436     out0 = in0 * in1;                        \
   1437     out1 = in2 * in3;                        \
   1438   }
   1439 #define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
   1440   {                                                                          \
   1441     MUL2(in0, in1, in2, in3, out0, out1);                                    \
   1442     MUL2(in4, in5, in6, in7, out2, out3);                                    \
   1443   }
   1444 
   1445 /* Description : Addition of 2 pairs of vectors
   1446    Arguments   : Inputs  - in0, in1, in2, in3
   1447                  Outputs - out0, out1
   1448    Details     : Each element in 'in0' is added to 'in1' and result is written
   1449                  to 'out0'.
   1450 */
   1451 #define ADD2(in0, in1, in2, in3, out0, out1) \
   1452   {                                          \
   1453     out0 = in0 + in1;                        \
   1454     out1 = in2 + in3;                        \
   1455   }
   1456 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
   1457   {                                                                          \
   1458     ADD2(in0, in1, in2, in3, out0, out1);                                    \
   1459     ADD2(in4, in5, in6, in7, out2, out3);                                    \
   1460   }
   1461 
   1462 /* Description : Subtraction of 2 pairs of vectors
   1463    Arguments   : Inputs  - in0, in1, in2, in3
   1464                  Outputs - out0, out1
   1465    Details     : Each element in 'in1' is subtracted from 'in0' and result is
   1466                  written to 'out0'.
   1467 */
   1468 #define SUB2(in0, in1, in2, in3, out0, out1) \
   1469   {                                          \
   1470     out0 = in0 - in1;                        \
   1471     out1 = in2 - in3;                        \
   1472   }
   1473 #define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
   1474   {                                                                          \
   1475     out0 = in0 - in1;                                                        \
   1476     out1 = in2 - in3;                                                        \
   1477     out2 = in4 - in5;                                                        \
   1478     out3 = in6 - in7;                                                        \
   1479   }
   1480 
   1481 /* Description : Sign extend halfword elements from right half of the vector
   1482    Arguments   : Input  - in    (halfword vector)
   1483                  Output - out   (sign extended word vector)
   1484                  Return Type - signed word
   1485    Details     : Sign bit of halfword elements from input vector 'in' is
   1486                  extracted and interleaved with same vector 'in0' to generate
   1487                  4 word elements keeping sign intact
   1488 */
   1489 #define UNPCK_R_SH_SW(in, out)                    \
   1490   {                                               \
   1491     v8i16 sign_m;                                 \
   1492                                                   \
   1493     sign_m = __msa_clti_s_h((v8i16)in, 0);        \
   1494     out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \
   1495   }
   1496 
   1497 /* Description : Zero extend unsigned byte elements to halfword elements
   1498    Arguments   : Input   - in          (unsigned byte vector)
   1499                  Outputs - out0, out1  (unsigned  halfword vectors)
   1500                  Return Type - signed halfword
   1501    Details     : Zero extended right half of vector is returned in 'out0'
   1502                  Zero extended left half of vector is returned in 'out1'
   1503 */
   1504 #define UNPCK_UB_SH(in, out0, out1)      \
   1505   {                                      \
   1506     v16i8 zero_m = { 0 };                \
   1507                                          \
   1508     ILVRL_B2_SH(zero_m, in, out0, out1); \
   1509   }
   1510 
   1511 /* Description : Sign extend halfword elements from input vector and return
   1512                  the result in pair of vectors
   1513    Arguments   : Input   - in            (halfword vector)
   1514                  Outputs - out0, out1   (sign extended word vectors)
   1515                  Return Type - signed word
   1516    Details     : Sign bit of halfword elements from input vector 'in' is
   1517                  extracted and interleaved right with same vector 'in0' to
   1518                  generate 4 signed word elements in 'out0'
   1519                  Then interleaved left with same vector 'in0' to
   1520                  generate 4 signed word elements in 'out1'
   1521 */
   1522 #define UNPCK_SH_SW(in, out0, out1)       \
   1523   {                                       \
   1524     v8i16 tmp_m;                          \
   1525                                           \
   1526     tmp_m = __msa_clti_s_h((v8i16)in, 0); \
   1527     ILVRL_H2_SW(tmp_m, in, out0, out1);   \
   1528   }
   1529 
   1530 /* Description : Butterfly of 4 input vectors
   1531    Arguments   : Inputs  - in0, in1, in2, in3
   1532                  Outputs - out0, out1, out2, out3
   1533    Details     : Butterfly operation
   1534 */
   1535 #define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
   1536   {                                                             \
   1537     out0 = in0 + in3;                                           \
   1538     out1 = in1 + in2;                                           \
   1539                                                                 \
   1540     out2 = in1 - in2;                                           \
   1541     out3 = in0 - in3;                                           \
   1542   }
   1543 
   1544 /* Description : Transpose input 8x8 byte block
   1545    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
   1546                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
   1547                  Return Type - as per RTYPE
   1548 */
   1549 #define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0,   \
   1550                         out1, out2, out3, out4, out5, out6, out7)              \
   1551   {                                                                            \
   1552     v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
   1553     v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                      \
   1554                                                                                \
   1555     ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \
   1556                tmp3_m);                                                        \
   1557     ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                               \
   1558     ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                               \
   1559     ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                               \
   1560     ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                               \
   1561     SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                               \
   1562     SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                               \
   1563   }
   1564 #define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
   1565 
   1566 /* Description : Transpose 16x4 block into 4x16 with byte elements in vectors
   1567    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
   1568                            in8, in9, in10, in11, in12, in13, in14, in15
   1569                  Outputs - out0, out1, out2, out3
   1570                  Return Type - unsigned byte
   1571 */
   1572 #define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
   1573                             in10, in11, in12, in13, in14, in15, out0, out1,   \
   1574                             out2, out3)                                       \
   1575   {                                                                           \
   1576     v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
   1577                                                                               \
   1578     ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m);                         \
   1579     out1 = (v16u8)__msa_ilvev_d(tmp1_m, tmp0_m);                              \
   1580                                                                               \
   1581     ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m);                         \
   1582     out3 = (v16u8)__msa_ilvev_d(tmp1_m, tmp0_m);                              \
   1583                                                                               \
   1584     ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m);                        \
   1585                                                                               \
   1586     tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                   \
   1587     ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m);                        \
   1588                                                                               \
   1589     tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                   \
   1590     ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);                  \
   1591     out0 = (v16u8)__msa_ilvev_h((v8i16)tmp1_m, (v8i16)tmp0_m);                \
   1592     out2 = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);                \
   1593                                                                               \
   1594     tmp0_m = (v2i64)__msa_ilvod_b((v16i8)out3, (v16i8)out1);                  \
   1595     tmp1_m = (v2i64)__msa_ilvod_b((v16i8)tmp3_m, (v16i8)tmp2_m);              \
   1596     out1 = (v16u8)__msa_ilvev_h((v8i16)tmp1_m, (v8i16)tmp0_m);                \
   1597     out3 = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);                \
   1598   }
   1599 
   1600 /* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
   1601    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
   1602                            in8, in9, in10, in11, in12, in13, in14, in15
   1603                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
   1604                  Return Type - unsigned byte
   1605 */
   1606 #define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
   1607                             in10, in11, in12, in13, in14, in15, out0, out1,   \
   1608                             out2, out3, out4, out5, out6, out7)               \
   1609   {                                                                           \
   1610     v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
   1611     v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                     \
   1612                                                                               \
   1613     ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                              \
   1614     ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                            \
   1615     ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                            \
   1616     ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                            \
   1617                                                                               \
   1618     tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                  \
   1619     tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                  \
   1620     tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                  \
   1621     tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                  \
   1622     out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                    \
   1623     tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                  \
   1624     out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                    \
   1625     tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                  \
   1626                                                                               \
   1627     ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                  \
   1628     out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
   1629     out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
   1630                                                                               \
   1631     tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);              \
   1632     tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                  \
   1633     out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
   1634     out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
   1635                                                                               \
   1636     ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);              \
   1637     out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
   1638     out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
   1639                                                                               \
   1640     tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);              \
   1641     tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);              \
   1642     tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);              \
   1643     tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);              \
   1644     out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
   1645     out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
   1646   }
   1647 
   1648 /* Description : Transpose 4x4 block with half word elements in vectors
   1649    Arguments   : Inputs  - in0, in1, in2, in3
   1650                  Outputs - out0, out1, out2, out3
   1651                  Return Type - signed halfword
   1652 */
   1653 #define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
   1654   {                                                                    \
   1655     v8i16 s0_m, s1_m;                                                  \
   1656                                                                        \
   1657     ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                        \
   1658     ILVRL_W2_SH(s1_m, s0_m, out0, out2);                               \
   1659     out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);              \
   1660     out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);              \
   1661   }
   1662 
   1663 /* Description : Transpose 8x4 block with half word elements in vectors
   1664    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
   1665                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
   1666                  Return Type - signed halfword
   1667 */
   1668 #define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
   1669   {                                                                    \
   1670     v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                              \
   1671                                                                        \
   1672     ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m);                    \
   1673     ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m);                    \
   1674     ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2);            \
   1675     ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3);            \
   1676   }
   1677 
   1678 /* Description : Transpose 4x4 block with word elements in vectors
   1679    Arguments   : Inputs  - in0, in1, in2, in3
   1680                  Outputs - out0, out1, out2, out3
   1681                  Return Type - signed word
   1682 */
   1683 #define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
   1684   {                                                                    \
   1685     v4i32 s0_m, s1_m, s2_m, s3_m;                                      \
   1686                                                                        \
   1687     ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                 \
   1688     ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                 \
   1689                                                                        \
   1690     out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m);              \
   1691     out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);              \
   1692     out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);              \
   1693     out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);              \
   1694   }
   1695 
   1696 /* Description : Dot product and addition over 3 pairs of signed byte vectors
   1697    Arguments   : Inputs - in0, in1, in2, coeff0, coeff1, coeff2
   1698                  Output - out0_m
   1699                  Return Type - signed halfword
   1700    Details     : All inputs are reinterpreted as signed byte vectors (v16i8)
   1701                  Dot product of 'in0' with 'coeff0'
   1702                  Dot product of 'in1' with 'coeff1', accumulated onto the above
   1703                  Dot product of 'in2' with 'coeff2'
   1704                  Saturating addition (__msa_adds_s_h) of the two partial sums
   1705                  out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
   1706 */
   1707 #define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)          \
   1708   ({                                                                 \
   1709     v8i16 out0_m, tmp1_m;                                            \
   1710                                                                      \
   1711     out0_m = __msa_dotp_s_h((v16i8)(in0), (v16i8)(coeff0));          \
   1712     out0_m = __msa_dpadd_s_h(out0_m, (v16i8)(in1), (v16i8)(coeff1)); \
   1713     tmp1_m = __msa_dotp_s_h((v16i8)(in2), (v16i8)(coeff2));          \
   1714     out0_m = __msa_adds_s_h(out0_m, tmp1_m);                         \
   1715                                                                      \
   1716     out0_m;                                                          \
   1717   })
   1718 
   1719 /* Description : Pack even elements of input vectors & xor with 128
   1720    Arguments   : Inputs - in0, in1 (each reinterpreted as v16i8)
   1721                  Output - out_m
   1722                  Return Type - unsigned byte
   1723    Details     : Even byte elements from 'in0' and 'in1' are packed together in
   1724                  one vector and each byte of the result is xor'ed with 128 to
   1725                  shift the range from signed to unsigned byte
   1726 */
   1727 #define PCKEV_XORI128_UB(in0, in1)                            \
   1728   ({                                                          \
   1729     v16u8 out_m;                                              \
   1730     out_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in0)); \
   1731     out_m = (v16u8)__msa_xori_b(out_m, 128);                  \
   1732     out_m;                                                    \
   1733   })
   1734 
   1735 /* Description : Pack the even byte elements of 'in0' and 'in1' into one vector
   1736                  and store it to 'pdst' with ST_SB
   1737    Arguments   : Inputs - in0, in1 (reinterpreted as v16i8), pdst
   1738 */
   1739 #define PCKEV_ST_SB(in0, in1, pdst)                    \
   1740   {                                                    \
   1741     v16i8 tmp_m;                                       \
   1742     tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in0)); \
   1743     ST_SB(tmp_m, (pdst));                              \
   1744   }
   1745 
   1746 /* Description : Horizontal 2 tap filter kernel code
   1747    Arguments   : Inputs - in0, in1, mask, coeff, shift
   1748                  Return Type - unsigned halfword (v8u16)
   1749    Details     : vshf_b with 'mask', dotp_u_h with 'coeff', srari_h by 'shift'
   1750 */
   1751 #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)              \
   1752   ({                                                                  \
   1753     v16i8 tmp0_m;                                                     \
   1754     v8u16 tmp1_m;                                                     \
   1755     tmp0_m = __msa_vshf_b((v16i8)(mask), (v16i8)(in1), (v16i8)(in0)); \
   1756     tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)(coeff));           \
   1757     tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift);              \
   1758     tmp1_m;                                                           \
   1759   })
   1760 #endif /* VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ */
   1761