Home | History | Annotate | Download | only in mips
      1 /*
      2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #ifndef VPX_DSP_MIPS_MACROS_MSA_H_
     12 #define VPX_DSP_MIPS_MACROS_MSA_H_
     13 
     14 #include <msa.h>
     15 
     16 #include "./vpx_config.h"
     17 #include "vpx/vpx_integer.h"
     18 
     19 #define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
     20 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
     21 #define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
     22 
     23 #define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc))
     24 #define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
     25 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
     26 
     27 #define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc))
     28 #define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
     29 
     30 #define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
     31 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
     32 #define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
     33 
     34 #define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
     35 #define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
     36 
     37 #define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
     38 #define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
     39 
     40 #if (__mips_isa_rev >= 6)
     41 #define LH(psrc)                                          \
     42   ({                                                      \
     43     const uint8_t *psrc_m = (const uint8_t *)(psrc);      \
     44     uint16_t val_m;                                       \
     45                                                           \
     46     __asm__ __volatile__("lh  %[val_m],  %[psrc_m]  \n\t" \
     47                                                           \
     48                          : [val_m] "=r"(val_m)            \
     49                          : [psrc_m] "m"(*psrc_m));        \
     50                                                           \
     51     val_m;                                                \
     52   })
     53 
     54 #define LW(psrc)                                          \
     55   ({                                                      \
     56     const uint8_t *psrc_m = (const uint8_t *)(psrc);      \
     57     uint32_t val_m;                                       \
     58                                                           \
     59     __asm__ __volatile__("lw  %[val_m],  %[psrc_m]  \n\t" \
     60                                                           \
     61                          : [val_m] "=r"(val_m)            \
     62                          : [psrc_m] "m"(*psrc_m));        \
     63                                                           \
     64     val_m;                                                \
     65   })
     66 
     67 #if (__mips == 64)
     68 #define LD(psrc)                                          \
     69   ({                                                      \
     70     const uint8_t *psrc_m = (const uint8_t *)(psrc);      \
     71     uint64_t val_m = 0;                                   \
     72                                                           \
     73     __asm__ __volatile__("ld  %[val_m],  %[psrc_m]  \n\t" \
     74                                                           \
     75                          : [val_m] "=r"(val_m)            \
     76                          : [psrc_m] "m"(*psrc_m));        \
     77                                                           \
     78     val_m;                                                \
     79   })
     80 #else  // !(__mips == 64)
     81 #define LD(psrc)                                            \
     82   ({                                                        \
     83     const uint8_t *psrc_m = (const uint8_t *)(psrc);        \
     84     uint32_t val0_m, val1_m;                                \
     85     uint64_t val_m = 0;                                     \
     86                                                             \
     87     val0_m = LW(psrc_m);                                    \
     88     val1_m = LW(psrc_m + 4);                                \
     89                                                             \
     90     val_m = (uint64_t)(val1_m);                             \
     91     val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
     92     val_m = (uint64_t)(val_m | (uint64_t)val0_m);           \
     93                                                             \
     94     val_m;                                                  \
     95   })
     96 #endif  // (__mips == 64)
     97 
     98 #define SH(val, pdst)                                     \
     99   {                                                       \
    100     uint8_t *pdst_m = (uint8_t *)(pdst);                  \
    101     const uint16_t val_m = (val);                         \
    102                                                           \
    103     __asm__ __volatile__("sh  %[val_m],  %[pdst_m]  \n\t" \
    104                                                           \
    105                          : [pdst_m] "=m"(*pdst_m)         \
    106                          : [val_m] "r"(val_m));           \
    107   }
    108 
    109 #define SW(val, pdst)                                     \
    110   {                                                       \
    111     uint8_t *pdst_m = (uint8_t *)(pdst);                  \
    112     const uint32_t val_m = (val);                         \
    113                                                           \
    114     __asm__ __volatile__("sw  %[val_m],  %[pdst_m]  \n\t" \
    115                                                           \
    116                          : [pdst_m] "=m"(*pdst_m)         \
    117                          : [val_m] "r"(val_m));           \
    118   }
    119 
    120 #define SD(val, pdst)                                     \
    121   {                                                       \
    122     uint8_t *pdst_m = (uint8_t *)(pdst);                  \
    123     const uint64_t val_m = (val);                         \
    124                                                           \
    125     __asm__ __volatile__("sd  %[val_m],  %[pdst_m]  \n\t" \
    126                                                           \
    127                          : [pdst_m] "=m"(*pdst_m)         \
    128                          : [val_m] "r"(val_m));           \
    129   }
    130 #else  // !(__mips_isa_rev >= 6)
    131 #define LH(psrc)                                           \
    132   ({                                                       \
    133     const uint8_t *psrc_m = (const uint8_t *)(psrc);       \
    134     uint16_t val_m;                                        \
    135                                                            \
    136     __asm__ __volatile__("ulh  %[val_m],  %[psrc_m]  \n\t" \
    137                                                            \
    138                          : [val_m] "=r"(val_m)             \
    139                          : [psrc_m] "m"(*psrc_m));         \
    140                                                            \
    141     val_m;                                                 \
    142   })
    143 
    144 #define LW(psrc)                                           \
    145   ({                                                       \
    146     const uint8_t *psrc_m = (const uint8_t *)(psrc);       \
    147     uint32_t val_m;                                        \
    148                                                            \
    149     __asm__ __volatile__("ulw  %[val_m],  %[psrc_m]  \n\t" \
    150                                                            \
    151                          : [val_m] "=r"(val_m)             \
    152                          : [psrc_m] "m"(*psrc_m));         \
    153                                                            \
    154     val_m;                                                 \
    155   })
    156 
    157 #if (__mips == 64)
    158 #define LD(psrc)                                           \
    159   ({                                                       \
    160     const uint8_t *psrc_m = (const uint8_t *)(psrc);       \
    161     uint64_t val_m = 0;                                    \
    162                                                            \
    163     __asm__ __volatile__("uld  %[val_m],  %[psrc_m]  \n\t" \
    164                                                            \
    165                          : [val_m] "=r"(val_m)             \
    166                          : [psrc_m] "m"(*psrc_m));         \
    167                                                            \
    168     val_m;                                                 \
    169   })
    170 #else  // !(__mips == 64)
    171 #define LD(psrc)                                                              \
    172   ({                                                                          \
    173     const uint8_t *psrc_m1 = (const uint8_t *)(psrc);                         \
    174     uint32_t val0_m, val1_m;                                                  \
    175     uint64_t val_m_combined = 0;                                              \
    176                                                                               \
    177     val0_m = LW(psrc_m1);                                                     \
    178     val1_m = LW(psrc_m1 + 4);                                                 \
    179                                                                               \
    180     val_m_combined = (uint64_t)(val1_m);                                      \
    181     val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \
    182     val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m);           \
    183                                                                               \
    184     val_m_combined;                                                           \
    185   })
    186 #endif  // (__mips == 64)
    187 
    188 #define SH(val, pdst)                                      \
    189   {                                                        \
    190     uint8_t *pdst_m = (uint8_t *)(pdst);                   \
    191     const uint16_t val_m = (val);                          \
    192                                                            \
    193     __asm__ __volatile__("ush  %[val_m],  %[pdst_m]  \n\t" \
    194                                                            \
    195                          : [pdst_m] "=m"(*pdst_m)          \
    196                          : [val_m] "r"(val_m));            \
    197   }
    198 
    199 #define SW(val, pdst)                                      \
    200   {                                                        \
    201     uint8_t *pdst_m = (uint8_t *)(pdst);                   \
    202     const uint32_t val_m = (val);                          \
    203                                                            \
    204     __asm__ __volatile__("usw  %[val_m],  %[pdst_m]  \n\t" \
    205                                                            \
    206                          : [pdst_m] "=m"(*pdst_m)          \
    207                          : [val_m] "r"(val_m));            \
    208   }
    209 
    210 #define SD(val, pdst)                                        \
    211   {                                                          \
    212     uint8_t *pdst_m1 = (uint8_t *)(pdst);                    \
    213     uint32_t val0_m, val1_m;                                 \
    214                                                              \
    215     val0_m = (uint32_t)((val)&0x00000000FFFFFFFF);           \
    216     val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
    217                                                              \
    218     SW(val0_m, pdst_m1);                                     \
    219     SW(val1_m, pdst_m1 + 4);                                 \
    220   }
    221 #endif  // (__mips_isa_rev >= 6)
    222 
    223 /* Description : Load 4 words with stride
    224    Arguments   : Inputs  - psrc, stride
    225                  Outputs - out0, out1, out2, out3
    226    Details     : Load word in 'out0' from (psrc)
    227                  Load word in 'out1' from (psrc + stride)
    228                  Load word in 'out2' from (psrc + 2 * stride)
    229                  Load word in 'out3' from (psrc + 3 * stride)
    230 */
    231 #define LW4(psrc, stride, out0, out1, out2, out3) \
    232   {                                               \
    233     out0 = LW((psrc));                            \
    234     out1 = LW((psrc) + stride);                   \
    235     out2 = LW((psrc) + 2 * stride);               \
    236     out3 = LW((psrc) + 3 * stride);               \
    237   }
    238 
    239 /* Description : Load double words with stride
    240    Arguments   : Inputs  - psrc, stride
    241                  Outputs - out0, out1
    242    Details     : Load double word in 'out0' from (psrc)
    243                  Load double word in 'out1' from (psrc + stride)
    244 */
    245 #define LD2(psrc, stride, out0, out1) \
    246   {                                   \
    247     out0 = LD((psrc));                \
    248     out1 = LD((psrc) + stride);       \
    249   }
    250 #define LD4(psrc, stride, out0, out1, out2, out3) \
    251   {                                               \
    252     LD2((psrc), stride, out0, out1);              \
    253     LD2((psrc) + 2 * stride, stride, out2, out3); \
    254   }
    255 
    256 /* Description : Store 4 words with stride
    257    Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
    258    Details     : Store word from 'in0' to (pdst)
    259                  Store word from 'in1' to (pdst + stride)
    260                  Store word from 'in2' to (pdst + 2 * stride)
    261                  Store word from 'in3' to (pdst + 3 * stride)
    262 */
    263 #define SW4(in0, in1, in2, in3, pdst, stride) \
    264   {                                           \
    265     SW(in0, (pdst))                           \
    266     SW(in1, (pdst) + stride);                 \
    267     SW(in2, (pdst) + 2 * stride);             \
    268     SW(in3, (pdst) + 3 * stride);             \
    269   }
    270 
    271 /* Description : Store 4 double words with stride
    272    Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
    273    Details     : Store double word from 'in0' to (pdst)
    274                  Store double word from 'in1' to (pdst + stride)
    275                  Store double word from 'in2' to (pdst + 2 * stride)
    276                  Store double word from 'in3' to (pdst + 3 * stride)
    277 */
    278 #define SD4(in0, in1, in2, in3, pdst, stride) \
    279   {                                           \
    280     SD(in0, (pdst))                           \
    281     SD(in1, (pdst) + stride);                 \
    282     SD(in2, (pdst) + 2 * stride);             \
    283     SD(in3, (pdst) + 3 * stride);             \
    284   }
    285 
    286 /* Description : Load vectors with 16 byte elements with stride
    287    Arguments   : Inputs  - psrc, stride
    288                  Outputs - out0, out1
    289                  Return Type - as per RTYPE
    290    Details     : Load 16 byte elements in 'out0' from (psrc)
    291                  Load 16 byte elements in 'out1' from (psrc + stride)
    292 */
    293 #define LD_B2(RTYPE, psrc, stride, out0, out1) \
    294   {                                            \
    295     out0 = LD_B(RTYPE, (psrc));                \
    296     out1 = LD_B(RTYPE, (psrc) + stride);       \
    297   }
    298 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
    299 #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
    300 
    301 #define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
    302   {                                                  \
    303     LD_B2(RTYPE, (psrc), stride, out0, out1);        \
    304     out2 = LD_B(RTYPE, (psrc) + 2 * stride);         \
    305   }
    306 #define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
    307 
    308 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
    309   {                                                        \
    310     LD_B2(RTYPE, (psrc), stride, out0, out1);              \
    311     LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
    312   }
    313 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
    314 #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
    315 
    316 #define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
    317   {                                                              \
    318     LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);        \
    319     out4 = LD_B(RTYPE, (psrc) + 4 * stride);                     \
    320   }
    321 #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
    322 #define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
    323 
    324 #define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \
    325   {                                                                          \
    326     LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);              \
    327     LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);                   \
    328   }
    329 #define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
    330 
    331 #define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
    332               out7)                                                          \
    333   {                                                                          \
    334     LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);                    \
    335     LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);       \
    336   }
    337 #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
    338 #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
    339 
    340 /* Description : Load vectors with 8 halfword elements with stride
    341    Arguments   : Inputs  - psrc, stride
    342                  Outputs - out0, out1
    343    Details     : Load 8 halfword elements in 'out0' from (psrc)
    344                  Load 8 halfword elements in 'out1' from (psrc + stride)
    345 */
    346 #define LD_H2(RTYPE, psrc, stride, out0, out1) \
    347   {                                            \
    348     out0 = LD_H(RTYPE, (psrc));                \
    349     out1 = LD_H(RTYPE, (psrc) + (stride));     \
    350   }
    351 #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
    352 
    353 #define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \
    354   {                                                        \
    355     LD_H2(RTYPE, (psrc), stride, out0, out1);              \
    356     LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
    357   }
    358 #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
    359 
    360 #define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
    361               out7)                                                          \
    362   {                                                                          \
    363     LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);                    \
    364     LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);       \
    365   }
    366 #define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
    367 
    368 #define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6,  \
    369                out7, out8, out9, out10, out11, out12, out13, out14, out15)     \
    370   {                                                                            \
    371     LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6,     \
    372           out7);                                                               \
    373     LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \
    374           out13, out14, out15);                                                \
    375   }
    376 #define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
    377 
    378 /* Description : Load 4x4 block of signed halfword elements from 1D source
    379                  data into 4 vectors (Each vector with 4 signed halfwords)
    380    Arguments   : Input   - psrc
    381                  Outputs - out0, out1, out2, out3
    382 */
    383 #define LD4x4_SH(psrc, out0, out1, out2, out3)            \
    384   {                                                       \
    385     out0 = LD_SH(psrc);                                   \
    386     out2 = LD_SH(psrc + 8);                               \
    387     out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
    388     out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
    389   }
    390 
    391 /* Description : Load 2 vectors of signed word elements with stride
    392    Arguments   : Inputs  - psrc, stride
    393                  Outputs - out0, out1
    394                  Return Type - signed word
    395 */
    396 #define LD_SW2(psrc, stride, out0, out1) \
    397   {                                      \
    398     out0 = LD_SW((psrc));                \
    399     out1 = LD_SW((psrc) + stride);       \
    400   }
    401 
    402 /* Description : Store vectors of 16 byte elements with stride
    403    Arguments   : Inputs - in0, in1, pdst, stride
    404    Details     : Store 16 byte elements from 'in0' to (pdst)
    405                  Store 16 byte elements from 'in1' to (pdst + stride)
    406 */
    407 #define ST_B2(RTYPE, in0, in1, pdst, stride) \
    408   {                                          \
    409     ST_B(RTYPE, in0, (pdst));                \
    410     ST_B(RTYPE, in1, (pdst) + stride);       \
    411   }
    412 #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
    413 
    414 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)   \
    415   {                                                      \
    416     ST_B2(RTYPE, in0, in1, (pdst), stride);              \
    417     ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
    418   }
    419 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
    420 
    421 #define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
    422   {                                                                        \
    423     ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);                        \
    424     ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);         \
    425   }
    426 #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
    427 
    428 /* Description : Store vectors of 8 halfword elements with stride
    429    Arguments   : Inputs - in0, in1, pdst, stride
    430    Details     : Store 8 halfword elements from 'in0' to (pdst)
    431                  Store 8 halfword elements from 'in1' to (pdst + stride)
    432 */
    433 #define ST_H2(RTYPE, in0, in1, pdst, stride) \
    434   {                                          \
    435     ST_H(RTYPE, in0, (pdst));                \
    436     ST_H(RTYPE, in1, (pdst) + stride);       \
    437   }
    438 #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
    439 
    440 #define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride)   \
    441   {                                                      \
    442     ST_H2(RTYPE, in0, in1, (pdst), stride);              \
    443     ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
    444   }
    445 #define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
    446 
    447 #define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
    448   {                                                                        \
    449     ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);                      \
    450     ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);         \
    451   }
    452 #define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
    453 
    454 /* Description : Store vectors of word elements with stride
    455    Arguments   : Inputs - in0, in1, pdst, stride
    456    Details     : Store 4 word elements from 'in0' to (pdst)
    457                  Store 4 word elements from 'in1' to (pdst + stride)
    458 */
    459 #define ST_SW2(in0, in1, pdst, stride) \
    460   {                                    \
    461     ST_SW(in0, (pdst));                \
    462     ST_SW(in1, (pdst) + stride);       \
    463   }
    464 
    465 /* Description : Store 2x4 byte block to destination memory from input vector
    466    Arguments   : Inputs - in, stidx, pdst, stride
    467    Details     : Index 'stidx' halfword element from 'in' vector is copied to
    468                  the GP register and stored to (pdst)
    469                  Index 'stidx+1' halfword element from 'in' vector is copied to
    470                  the GP register and stored to (pdst + stride)
    471                  Index 'stidx+2' halfword element from 'in' vector is copied to
    472                  the GP register and stored to (pdst + 2 * stride)
    473                  Index 'stidx+3' halfword element from 'in' vector is copied to
    474                  the GP register and stored to (pdst + 3 * stride)
    475 */
    476 #define ST2x4_UB(in, stidx, pdst, stride)            \
    477   {                                                  \
    478     uint16_t out0_m, out1_m, out2_m, out3_m;         \
    479     uint8_t *pblk_2x4_m = (uint8_t *)(pdst);         \
    480                                                      \
    481     out0_m = __msa_copy_u_h((v8i16)in, (stidx));     \
    482     out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \
    483     out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \
    484     out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \
    485                                                      \
    486     SH(out0_m, pblk_2x4_m);                          \
    487     SH(out1_m, pblk_2x4_m + stride);                 \
    488     SH(out2_m, pblk_2x4_m + 2 * stride);             \
    489     SH(out3_m, pblk_2x4_m + 3 * stride);             \
    490   }
    491 
    492 /* Description : Store 4x2 byte block to destination memory from input vector
    493    Arguments   : Inputs - in, pdst, stride
    494    Details     : Index 0 word element from 'in' vector is copied to the GP
    495                  register and stored to (pdst)
    496                  Index 1 word element from 'in' vector is copied to the GP
    497                  register and stored to (pdst + stride)
    498 */
    499 #define ST4x2_UB(in, pdst, stride)           \
    500   {                                          \
    501     uint32_t out0_m, out1_m;                 \
    502     uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \
    503                                              \
    504     out0_m = __msa_copy_u_w((v4i32)in, 0);   \
    505     out1_m = __msa_copy_u_w((v4i32)in, 1);   \
    506                                              \
    507     SW(out0_m, pblk_4x2_m);                  \
    508     SW(out1_m, pblk_4x2_m + stride);         \
    509   }
    510 
    511 /* Description : Store 4x4 byte block to destination memory from input vector
    512    Arguments   : Inputs - in0, in1, pdst, stride
    513    Details     : 'Idx0' word element from input vector 'in0' is copied to the
    514                  GP register and stored to (pdst)
    515                  'Idx1' word element from input vector 'in0' is copied to the
    516                  GP register and stored to (pdst + stride)
    517                  'Idx2' word element from input vector 'in0' is copied to the
    518                  GP register and stored to (pdst + 2 * stride)
    519                  'Idx3' word element from input vector 'in0' is copied to the
    520                  GP register and stored to (pdst + 3 * stride)
    521 */
    522 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
    523   {                                                              \
    524     uint32_t out0_m, out1_m, out2_m, out3_m;                     \
    525     uint8_t *pblk_4x4_m = (uint8_t *)(pdst);                     \
    526                                                                  \
    527     out0_m = __msa_copy_u_w((v4i32)in0, idx0);                   \
    528     out1_m = __msa_copy_u_w((v4i32)in0, idx1);                   \
    529     out2_m = __msa_copy_u_w((v4i32)in1, idx2);                   \
    530     out3_m = __msa_copy_u_w((v4i32)in1, idx3);                   \
    531                                                                  \
    532     SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);     \
    533   }
    534 #define ST4x8_UB(in0, in1, pdst, stride)                           \
    535   {                                                                \
    536     uint8_t *pblk_4x8 = (uint8_t *)(pdst);                         \
    537                                                                    \
    538     ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);              \
    539     ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
    540   }
    541 
    542 /* Description : Store 8x1 byte block to destination memory from input vector
    543    Arguments   : Inputs - in, pdst
    544    Details     : Index 0 double word element from 'in' vector is copied to the
    545                  GP register and stored to (pdst)
    546 */
    547 #define ST8x1_UB(in, pdst)                 \
    548   {                                        \
    549     uint64_t out0_m;                       \
    550                                            \
    551     out0_m = __msa_copy_u_d((v2i64)in, 0); \
    552     SD(out0_m, pdst);                      \
    553   }
    554 
    555 /* Description : Store 8x2 byte block to destination memory from input vector
    556    Arguments   : Inputs - in, pdst, stride
    557    Details     : Index 0 double word element from 'in' vector is copied to the
    558                  GP register and stored to (pdst)
    559                  Index 1 double word element from 'in' vector is copied to the
    560                  GP register and stored to (pdst + stride)
    561 */
    562 #define ST8x2_UB(in, pdst, stride)           \
    563   {                                          \
    564     uint64_t out0_m, out1_m;                 \
    565     uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
    566                                              \
    567     out0_m = __msa_copy_u_d((v2i64)in, 0);   \
    568     out1_m = __msa_copy_u_d((v2i64)in, 1);   \
    569                                              \
    570     SD(out0_m, pblk_8x2_m);                  \
    571     SD(out1_m, pblk_8x2_m + stride);         \
    572   }
    573 
    574 /* Description : Store 8x4 byte block to destination memory from input
    575                  vectors
    576    Arguments   : Inputs - in0, in1, pdst, stride
    577    Details     : Index 0 double word element from 'in0' vector is copied to the
    578                  GP register and stored to (pdst)
    579                  Index 1 double word element from 'in0' vector is copied to the
    580                  GP register and stored to (pdst + stride)
    581                  Index 0 double word element from 'in1' vector is copied to the
    582                  GP register and stored to (pdst + 2 * stride)
    583                  Index 1 double word element from 'in1' vector is copied to the
    584                  GP register and stored to (pdst + 3 * stride)
    585 */
    586 #define ST8x4_UB(in0, in1, pdst, stride)                     \
    587   {                                                          \
    588     uint64_t out0_m, out1_m, out2_m, out3_m;                 \
    589     uint8_t *pblk_8x4_m = (uint8_t *)(pdst);                 \
    590                                                              \
    591     out0_m = __msa_copy_u_d((v2i64)in0, 0);                  \
    592     out1_m = __msa_copy_u_d((v2i64)in0, 1);                  \
    593     out2_m = __msa_copy_u_d((v2i64)in1, 0);                  \
    594     out3_m = __msa_copy_u_d((v2i64)in1, 1);                  \
    595                                                              \
    596     SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
    597   }
    598 
    599 /* Description : average with rounding (in0 + in1 + 1) / 2.
    600    Arguments   : Inputs  - in0, in1, in2, in3,
    601                  Outputs - out0, out1
    602                  Return Type - as per RTYPE
    603    Details     : Each unsigned byte element from 'in0' vector is added with
    604                  each unsigned byte element from 'in1' vector. Then the average
    605                  with rounding is calculated and written to 'out0'
    606 */
    607 #define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)   \
    608   {                                                       \
    609     out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \
    610     out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \
    611   }
    612 #define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
    613 
    614 #define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
    615                  out2, out3)                                                \
    616   {                                                                         \
    617     AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)                         \
    618     AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3)                         \
    619   }
    620 #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
    621 
    622 /* Description : Immediate number of elements to slide with zero
    623    Arguments   : Inputs  - in0, in1, slide_val
    624                  Outputs - out0, out1
    625                  Return Type - as per RTYPE
    626    Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
    627                  value specified in the 'slide_val'
    628 */
    629 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)             \
    630   {                                                                   \
    631     v16i8 zero_m = { 0 };                                             \
    632     out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \
    633     out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \
    634   }
    635 #define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
    636 
    637 #define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \
    638                   slide_val)                                         \
    639   {                                                                  \
    640     SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);               \
    641     SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);               \
    642   }
    643 #define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
    644 
    645 /* Description : Immediate number of elements to slide
    646    Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
    647                  Outputs - out0, out1
    648                  Return Type - as per RTYPE
    649    Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
    650                  value specified in the 'slide_val'
    651 */
    652 #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
    653   {                                                                       \
    654     out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val);    \
    655     out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val);    \
    656   }
    657 #define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
    658 #define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
    659 
    660 #define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \
    661                 out2, slide_val)                                             \
    662   {                                                                          \
    663     SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)        \
    664     out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val);       \
    665   }
    666 #define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
    667 #define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
    668 
    669 /* Description : Shuffle byte vector elements as per mask vector
    670    Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
    671                  Outputs - out0, out1
    672                  Return Type - as per RTYPE
    673    Details     : Byte elements from 'in0' & 'in1' are copied selectively to
    674                  'out0' as per control vector 'mask0'
    675 */
    676 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)  \
    677   {                                                                   \
    678     out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
    679     out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
    680   }
    681 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
    682 #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
    683 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
    684 
    685 #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \
    686                 out3)                                                          \
    687   {                                                                            \
    688     VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);              \
    689     VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);              \
    690   }
    691 #define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
    692 #define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
    693 
    694 /* Description : Dot product of byte vector elements
    695    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
    696                  Outputs - out0, out1
    697                  Return Type - as per RTYPE
    698    Details     : Unsigned byte elements from 'mult0' are multiplied with
    699                  unsigned byte elements from 'cnst0' producing a result
    700                  twice the size of input i.e. unsigned halfword.
    701                  The multiplication result of adjacent odd-even elements
    702                  are added together and written to the 'out0' vector
    703 */
    704 #define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
    705   {                                                             \
    706     out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0);   \
    707     out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1);   \
    708   }
    709 #define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
    710 
    711 #define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
    712                  cnst3, out0, out1, out2, out3)                          \
    713   {                                                                      \
    714     DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    715     DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
    716   }
    717 #define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
    718 
    719 /* Description : Dot product of byte vector elements
    720    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
    721                  Outputs - out0, out1
    722                  Return Type - as per RTYPE
    723    Details     : Signed byte elements from 'mult0' are multiplied with
    724                  signed byte elements from 'cnst0' producing a result
    725                  twice the size of input i.e. signed halfword.
    726                  The multiplication result of adjacent odd-even elements
    727                  are added together and written to the 'out0' vector
    728 */
    729 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
    730   {                                                             \
    731     out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0);   \
    732     out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1);   \
    733   }
    734 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
    735 
    736 #define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
    737                  cnst3, out0, out1, out2, out3)                          \
    738   {                                                                      \
    739     DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    740     DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
    741   }
    742 #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
    743 
    744 /* Description : Dot product of halfword vector elements
    745    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
    746                  Outputs - out0, out1
    747                  Return Type - as per RTYPE
    748    Details     : Signed halfword elements from 'mult0' are multiplied with
    749                  signed halfword elements from 'cnst0' producing a result
    750                  twice the size of input i.e. signed word.
    751                  The multiplication result of adjacent odd-even elements
    752                  are added together and written to the 'out0' vector
    753 */
    754 #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
    755   {                                                             \
    756     out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0);   \
    757     out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1);   \
    758   }
    759 #define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
    760 
    761 #define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
    762                  cnst3, out0, out1, out2, out3)                          \
    763   {                                                                      \
    764     DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    765     DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
    766   }
    767 #define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
    768 
    769 /* Description : Dot product of word vector elements
    770    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
    771                  Outputs - out0, out1
    772                  Return Type - as per RTYPE
    773    Details     : Signed word elements from 'mult0' are multiplied with
    774                  signed word elements from 'cnst0' producing a result
    775                  twice the size of input i.e. signed double word.
    776                  The multiplication result of adjacent odd-even elements
    777                  are added together and written to the 'out0' vector
    778 */
    779 #define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
    780   {                                                             \
    781     out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0);   \
    782     out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1);   \
    783   }
    784 #define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
    785 
    786 /* Description : Dot product & addition of byte vector elements
    787    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
    788                  Outputs - out0, out1
    789                  Return Type - as per RTYPE
    790    Details     : Signed byte elements from 'mult0' are multiplied with
    791                  signed byte elements from 'cnst0' producing a result
    792                  twice the size of input i.e. signed halfword.
    793                  The multiplication result of adjacent odd-even elements
    794                  are added to the 'out0' vector
    795 */
    796 #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)            \
    797   {                                                                         \
    798     out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \
    799     out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \
    800   }
    801 #define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
    802 
    803 #define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
    804                   cnst3, out0, out1, out2, out3)                          \
    805   {                                                                       \
    806     DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    807     DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
    808   }
    809 #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
    810 
    811 /* Description : Dot product & addition of halfword vector elements
    812    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
    813                  Outputs - out0, out1
    814                  Return Type - as per RTYPE
    815    Details     : Signed halfword elements from 'mult0' are multiplied with
    816                  signed halfword elements from 'cnst0' producing a result
    817                  twice the size of input i.e. signed word.
    818                  The multiplication result of adjacent odd-even elements
    819                  are added to the 'out0' vector
    820 */
    821 #define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)            \
    822   {                                                                         \
    823     out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \
    824     out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \
    825   }
    826 #define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
    827 
    828 /* Description : Dot product & addition of double word vector elements
    829    Arguments   : Inputs  - mult0, mult1
    830                  Outputs - out0, out1
    831                  Return Type - as per RTYPE
    832    Details     : Each signed word element from 'mult0' is multiplied with itself
    833                  producing an intermediate result twice the size of input
    834                  i.e. signed double word
    835                  The multiplication result of adjacent odd-even elements
    836                  are added to the 'out0' vector
    837 */
    838 #define DPADD_SD2(RTYPE, mult0, mult1, out0, out1)                          \
    839   {                                                                         \
    840     out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \
    841     out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \
    842   }
    843 #define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
    844 
    845 /* Description : Minimum values between unsigned elements of
    846                  either vector are copied to the output vector
    847    Arguments   : Inputs  - in0, in1, min_vec
    848                  Outputs - in place operation
    849                  Return Type - as per RTYPE
    850    Details     : Minimum of unsigned halfword element values from 'in0' and
    851                  'min_vec' are written to output vector 'in0'
    852 */
    853 #define MIN_UH2(RTYPE, in0, in1, min_vec)            \
    854   {                                                  \
    855     in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \
    856     in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \
    857   }
    858 #define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
    859 
    860 #define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \
    861   {                                                 \
    862     MIN_UH2(RTYPE, in0, in1, min_vec);              \
    863     MIN_UH2(RTYPE, in2, in3, min_vec);              \
    864   }
    865 #define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
    866 
    867 /* Description : Clips all signed halfword elements of input vector
    868                  between 0 & 255
    869    Arguments   : Input  - in
    870                  Output - out_m
    871                  Return Type - signed halfword
    872 */
    873 #define CLIP_SH_0_255(in)                              \
    874   ({                                                   \
    875     v8i16 max_m = __msa_ldi_h(255);                    \
    876     v8i16 out_m;                                       \
    877                                                        \
    878     out_m = __msa_maxi_s_h((v8i16)in, 0);              \
    879     out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
    880     out_m;                                             \
    881   })
    882 #define CLIP_SH2_0_255(in0, in1) \
    883   {                              \
    884     in0 = CLIP_SH_0_255(in0);    \
    885     in1 = CLIP_SH_0_255(in1);    \
    886   }
    887 #define CLIP_SH4_0_255(in0, in1, in2, in3) \
    888   {                                        \
    889     CLIP_SH2_0_255(in0, in1);              \
    890     CLIP_SH2_0_255(in2, in3);              \
    891   }
    892 
    893 /* Description : Horizontal addition of 4 signed word elements of input vector
    894    Arguments   : Input  - in       (signed word vector)
    895                  Output - sum_m    (i32 sum)
    896                  Return Type - signed word (GP)
    897    Details     : 4 signed word elements of 'in' vector are added together and
    898                  the resulting integer sum is returned
    899 */
    900 #define HADD_SW_S32(in)                            \
    901   ({                                               \
    902     v2i64 res0_m, res1_m;                          \
    903     int32_t sum_m;                                 \
    904                                                    \
    905     res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \
    906     res1_m = __msa_splati_d(res0_m, 1);            \
    907     res0_m = res0_m + res1_m;                      \
    908     sum_m = __msa_copy_s_w((v4i32)res0_m, 0);      \
    909     sum_m;                                         \
    910   })
    911 
    912 /* Description : Horizontal addition of 4 unsigned word elements
    913    Arguments   : Input  - in       (unsigned word vector)
    914                  Output - sum_m    (u32 sum)
    915                  Return Type - unsigned word (GP)
    916    Details     : 4 unsigned word elements of 'in' vector are added together and
    917                  the resulting integer sum is returned
    918 */
    919 #define HADD_UW_U32(in)                               \
    920   ({                                                  \
    921     v2u64 res0_m, res1_m;                             \
    922     uint32_t sum_m;                                   \
    923                                                       \
    924     res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in);    \
    925     res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
    926     res0_m += res1_m;                                 \
    927     sum_m = __msa_copy_u_w((v4i32)res0_m, 0);         \
    928     sum_m;                                            \
    929   })
    930 
    931 /* Description : Horizontal addition of 8 unsigned halfword elements
    932    Arguments   : Input  - in       (unsigned halfword vector)
    933                  Output - sum_m    (u32 sum)
    934                  Return Type - unsigned word
    935    Details     : 8 unsigned halfword elements of 'in' vector are added
    936                  together and the resulting integer sum is returned
    937 */
    938 #define HADD_UH_U32(in)                           \
    939   ({                                              \
    940     v4u32 res_m;                                  \
    941     uint32_t sum_m;                               \
    942                                                   \
    943     res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
    944     sum_m = HADD_UW_U32(res_m);                   \
    945     sum_m;                                        \
    946   })
    947 
    948 /* Description : Horizontal addition of unsigned byte vector elements
    949    Arguments   : Inputs  - in0, in1
    950                  Outputs - out0, out1
    951                  Return Type - as per RTYPE
    952    Details     : Each unsigned odd byte element from 'in0' is added to
    953                  even unsigned byte element from 'in0' (pairwise) and the
    954                  halfword result is written to 'out0'
    955 */
    956 #define HADD_UB2(RTYPE, in0, in1, out0, out1)             \
    957   {                                                       \
    958     out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \
    959     out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \
    960   }
    961 #define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
    962 
    963 #define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
    964   {                                                                 \
    965     HADD_UB2(RTYPE, in0, in1, out0, out1);                          \
    966     HADD_UB2(RTYPE, in2, in3, out2, out3);                          \
    967   }
    968 #define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
    969 
    970 /* Description : Horizontal subtraction of unsigned byte vector elements
    971    Arguments   : Inputs  - in0, in1
    972                  Outputs - out0, out1
    973                  Return Type - as per RTYPE
    974    Details     : Each unsigned odd byte element from 'in0' is subtracted from
    975                  even unsigned byte element from 'in0' (pairwise) and the
    976                  halfword result is written to 'out0'
    977 */
    978 #define HSUB_UB2(RTYPE, in0, in1, out0, out1)             \
    979   {                                                       \
    980     out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \
    981     out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \
    982   }
    983 #define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
    984 
    985 /* Description : SAD (Sum of Absolute Difference)
    986    Arguments   : Inputs  - in0, in1, ref0, ref1
    987                  Outputs - sad_m                 (halfword vector)
    988                  Return Type - unsigned halfword
    989    Details     : Absolute difference of all the byte elements from 'in0' with
    990                  'ref0' is calculated and preserved in 'diff0'. Then even-odd
    991                  pairs are added together to generate 8 halfword results.
    992 */
    993 #define SAD_UB2_UH(in0, in1, ref0, ref1)                     \
    994   ({                                                         \
    995     v16u8 diff0_m, diff1_m;                                  \
    996     v8u16 sad_m = { 0 };                                     \
    997                                                              \
    998     diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0);       \
    999     diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1);       \
   1000                                                              \
   1001     sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \
   1002     sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \
   1003                                                              \
   1004     sad_m;                                                   \
   1005   })
   1006 
   1007 /* Description : Horizontal subtraction of signed halfword vector elements
   1008    Arguments   : Inputs  - in0, in1
   1009                  Outputs - out0, out1
   1010                  Return Type - as per RTYPE
   1011    Details     : Each signed odd halfword element from 'in0' is subtracted from
   1012                  even signed halfword element from 'in0' (pairwise) and the
   1013                  word result is written to 'out0'
   1014 */
   1015 #define HSUB_UH2(RTYPE, in0, in1, out0, out1)             \
   1016   {                                                       \
   1017     out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \
   1018     out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \
   1019   }
   1020 #define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
   1021 
   1022 /* Description : Set element n input vector to GPR value
   1023    Arguments   : Inputs - in0, in1, in2, in3
   1024                  Output - out
   1025                  Return Type - as per RTYPE
   1026    Details     : Set element 0 in vector 'out' to value specified in 'in0'
   1027 */
   1028 #define INSERT_W2(RTYPE, in0, in1, out)              \
   1029   {                                                  \
   1030     out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
   1031     out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
   1032   }
   1033 #define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
   1034 
   1035 #define INSERT_W4(RTYPE, in0, in1, in2, in3, out)    \
   1036   {                                                  \
   1037     out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
   1038     out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
   1039     out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \
   1040     out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \
   1041   }
   1042 #define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
   1043 #define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
   1044 
   1045 #define INSERT_D2(RTYPE, in0, in1, out)              \
   1046   {                                                  \
   1047     out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
   1048     out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
   1049   }
   1050 #define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
   1051 #define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
   1052 #define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
   1053 
   1054 /* Description : Interleave even byte elements from vectors
   1055    Arguments   : Inputs  - in0, in1, in2, in3
   1056                  Outputs - out0, out1
   1057                  Return Type - as per RTYPE
   1058    Details     : Even byte elements of 'in0' and 'in1' are interleaved
   1059                  and written to 'out0'
   1060 */
   1061 #define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)  \
   1062   {                                                      \
   1063     out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
   1064     out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
   1065   }
   1066 #define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
   1067 #define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
   1068 
   1069 /* Description : Interleave even halfword elements from vectors
   1070    Arguments   : Inputs  - in0, in1, in2, in3
   1071                  Outputs - out0, out1
   1072                  Return Type - as per RTYPE
   1073    Details     : Even halfword elements of 'in0' and 'in1' are interleaved
   1074                  and written to 'out0'
   1075 */
   1076 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)  \
   1077   {                                                      \
   1078     out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
   1079     out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
   1080   }
   1081 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
   1082 #define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
   1083 #define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
   1084 
   1085 /* Description : Interleave even word elements from vectors
   1086    Arguments   : Inputs  - in0, in1, in2, in3
   1087                  Outputs - out0, out1
   1088                  Return Type - as per RTYPE
   1089    Details     : Even word elements of 'in0' and 'in1' are interleaved
   1090                  and written to 'out0'
   1091 */
   1092 #define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)  \
   1093   {                                                      \
   1094     out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
   1095     out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \
   1096   }
   1097 #define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
   1098 
   1099 /* Description : Interleave even double word elements from vectors
   1100    Arguments   : Inputs  - in0, in1, in2, in3
   1101                  Outputs - out0, out1
   1102                  Return Type - as per RTYPE
   1103    Details     : Even double word elements of 'in0' and 'in1' are interleaved
   1104                  and written to 'out0'
   1105 */
   1106 #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)  \
   1107   {                                                      \
   1108     out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \
   1109     out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \
   1110   }
   1111 #define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
   1112 
   1113 /* Description : Interleave left half of byte elements from vectors
   1114    Arguments   : Inputs  - in0, in1, in2, in3
   1115                  Outputs - out0, out1
   1116                  Return Type - as per RTYPE
   1117    Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
   1118                  and written to 'out0'.
   1119 */
   1120 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)  \
   1121   {                                                     \
   1122     out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
   1123     out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
   1124   }
   1125 #define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
   1126 #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
   1127 #define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
   1128 #define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
   1129 
   1130 #define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
   1131                 out2, out3)                                                \
   1132   {                                                                        \
   1133     ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
   1134     ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
   1135   }
   1136 #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
   1137 #define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
   1138 #define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
   1139 
   1140 /* Description : Interleave left half of halfword elements from vectors
   1141    Arguments   : Inputs  - in0, in1, in2, in3
   1142                  Outputs - out0, out1
   1143                  Return Type - as per RTYPE
   1144    Details     : Left half of halfword elements of 'in0' and 'in1' are
   1145                  interleaved and written to 'out0'.
   1146 */
   1147 #define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)  \
   1148   {                                                     \
   1149     out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
   1150     out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
   1151   }
   1152 #define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
   1153 #define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
   1154 
   1155 /* Description : Interleave left half of word elements from vectors
   1156    Arguments   : Inputs  - in0, in1, in2, in3
   1157                  Outputs - out0, out1
   1158                  Return Type - as per RTYPE
   1159    Details     : Left half of word elements of 'in0' and 'in1' are interleaved
   1160                  and written to 'out0'.
   1161 */
   1162 #define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)  \
   1163   {                                                     \
   1164     out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
   1165     out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \
   1166   }
   1167 #define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
   1168 #define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
   1169 
   1170 /* Description : Interleave right half of byte elements from vectors
   1171    Arguments   : Inputs  - in0, in1, in2, in3
   1172                  Outputs - out0, out1
   1173                  Return Type - as per RTYPE
   1174    Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
   1175                  and written to out0.
   1176 */
   1177 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)  \
   1178   {                                                     \
   1179     out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
   1180     out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
   1181   }
   1182 #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
   1183 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
   1184 #define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
   1185 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
   1186 
   1187 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
   1188                 out2, out3)                                                \
   1189   {                                                                        \
   1190     ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
   1191     ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
   1192   }
   1193 #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
   1194 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
   1195 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
   1196 #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
   1197 
   1198 #define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
   1199                 in11, in12, in13, in14, in15, out0, out1, out2, out3, out4,    \
   1200                 out5, out6, out7)                                              \
   1201   {                                                                            \
   1202     ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2,   \
   1203             out3);                                                             \
   1204     ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5,   \
   1205             out6, out7);                                                       \
   1206   }
   1207 #define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
   1208 
   1209 /* Description : Interleave right half of halfword elements from vectors
   1210    Arguments   : Inputs  - in0, in1, in2, in3
   1211                  Outputs - out0, out1
   1212                  Return Type - as per RTYPE
   1213    Details     : Right half of halfword elements of 'in0' and 'in1' are
   1214                  interleaved and written to 'out0'.
   1215 */
   1216 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)  \
   1217   {                                                     \
   1218     out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
   1219     out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
   1220   }
   1221 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
   1222 #define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
   1223 
   1224 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
   1225                 out2, out3)                                                \
   1226   {                                                                        \
   1227     ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
   1228     ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
   1229   }
   1230 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
   1231 
   1232 #define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)  \
   1233   {                                                     \
   1234     out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
   1235     out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \
   1236   }
   1237 #define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
   1238 #define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
   1239 
   1240 #define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
   1241                 out2, out3)                                                \
   1242   {                                                                        \
   1243     ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
   1244     ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
   1245   }
   1246 #define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
   1247 
   1248 /* Description : Interleave right half of double word elements from vectors
   1249    Arguments   : Inputs  - in0, in1, in2, in3
   1250                  Outputs - out0, out1
   1251                  Return Type - as per RTYPE
   1252    Details     : Right half of double word elements of 'in0' and 'in1' are
   1253                  interleaved and written to 'out0'.
   1254 */
   1255 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
   1256   {                                                         \
   1257     out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
   1258     out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
   1259   }
   1260 #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
   1261 #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
   1262 #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
   1263 
   1264 #define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
   1265   {                                                                    \
   1266     ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                    \
   1267     out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5));            \
   1268   }
   1269 #define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
   1270 
   1271 #define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
   1272                 out2, out3)                                                \
   1273   {                                                                        \
   1274     ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
   1275     ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
   1276   }
   1277 #define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
   1278 #define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
   1279 
   1280 /* Description : Interleave both left and right half of input vectors
   1281    Arguments   : Inputs  - in0, in1
   1282                  Outputs - out0, out1
   1283                  Return Type - as per RTYPE
   1284    Details     : Right half of byte elements from 'in0' and 'in1' are
   1285                  interleaved and written to 'out0'
   1286 */
   1287 #define ILVRL_B2(RTYPE, in0, in1, out0, out1)           \
   1288   {                                                     \
   1289     out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
   1290     out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
   1291   }
   1292 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
   1293 #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
   1294 #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
   1295 #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
   1296 
   1297 #define ILVRL_H2(RTYPE, in0, in1, out0, out1)           \
   1298   {                                                     \
   1299     out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
   1300     out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
   1301   }
   1302 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
   1303 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
   1304 
   1305 #define ILVRL_W2(RTYPE, in0, in1, out0, out1)           \
   1306   {                                                     \
   1307     out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
   1308     out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
   1309   }
   1310 #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
   1311 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
   1312 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
   1313 
   1314 /* Description : Saturate the halfword element values to the max
   1315                  unsigned value of (sat_val + 1) bits
   1316                  The element data width remains unchanged
   1317    Arguments   : Inputs  - in0, in1, sat_val
   1318                  Outputs - in place operation
   1319                  Return Type - as per RTYPE
   1320    Details     : Each unsigned halfword element from 'in0' is saturated to the
   1321                  value generated with (sat_val + 1) bit range.
   1322                  The results are written in place
   1323 */
   1324 #define SAT_UH2(RTYPE, in0, in1, sat_val)            \
   1325   {                                                  \
   1326     in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
   1327     in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \
   1328   }
   1329 #define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
   1330 
   1331 #define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
   1332   {                                                 \
   1333     SAT_UH2(RTYPE, in0, in1, sat_val);              \
   1334     SAT_UH2(RTYPE, in2, in3, sat_val)               \
   1335   }
   1336 #define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
   1337 
   1338 /* Description : Saturate the halfword element values to the max
   1339                  unsigned value of (sat_val + 1) bits
   1340                  The element data width remains unchanged
   1341    Arguments   : Inputs  - in0, in1, sat_val
   1342                  Outputs - in place operation
   1343                  Return Type - as per RTYPE
   1344    Details     : Each unsigned halfword element from 'in0' is saturated to the
   1345                  value generated with (sat_val + 1) bit range
   1346                  The results are written in place
   1347 */
   1348 #define SAT_SH2(RTYPE, in0, in1, sat_val)            \
   1349   {                                                  \
   1350     in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
   1351     in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \
   1352   }
   1353 #define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
   1354 
   1355 #define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
   1356   {                                                 \
   1357     SAT_SH2(RTYPE, in0, in1, sat_val);              \
   1358     SAT_SH2(RTYPE, in2, in3, sat_val);              \
   1359   }
   1360 #define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
   1361 
   1362 /* Description : Indexed halfword element values are replicated to all
   1363                  elements in output vector
   1364    Arguments   : Inputs  - in, idx0, idx1
   1365                  Outputs - out0, out1
   1366                  Return Type - as per RTYPE
   1367    Details     : 'idx0' element value from 'in' vector is replicated to all
   1368                   elements in 'out0' vector
   1369                   Valid index range for halfword operation is 0-7
   1370 */
   1371 #define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
   1372   {                                                  \
   1373     out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0);   \
   1374     out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1);   \
   1375   }
   1376 #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
   1377 
   1378 #define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \
   1379   {                                                                          \
   1380     SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);                            \
   1381     SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);                            \
   1382   }
   1383 #define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
   1384 #define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
   1385 
   1386 /* Description : Pack even byte elements of vector pairs
   1387    Arguments   : Inputs  - in0, in1, in2, in3
   1388                  Outputs - out0, out1
   1389                  Return Type - as per RTYPE
   1390    Details     : Even byte elements of 'in0' are copied to the left half of
   1391                  'out0' & even byte elements of 'in1' are copied to the right
   1392                  half of 'out0'.
   1393 */
   1394 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)  \
   1395   {                                                      \
   1396     out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
   1397     out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \
   1398   }
   1399 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
   1400 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
   1401 #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
   1402 
   1403 #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
   1404                  out2, out3)                                                \
   1405   {                                                                         \
   1406     PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
   1407     PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
   1408   }
   1409 #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
   1410 #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
   1411 #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
   1412 
   1413 /* Description : Pack even halfword elements of vector pairs
   1414    Arguments   : Inputs  - in0, in1, in2, in3
   1415                  Outputs - out0, out1
   1416                  Return Type - as per RTYPE
   1417    Details     : Even halfword elements of 'in0' are copied to the left half of
   1418                  'out0' & even halfword elements of 'in1' are copied to the
   1419                  right half of 'out0'.
   1420 */
   1421 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)  \
   1422   {                                                      \
   1423     out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
   1424     out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
   1425   }
   1426 #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
   1427 #define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
   1428 
   1429 #define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
   1430                  out2, out3)                                                \
   1431   {                                                                         \
   1432     PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
   1433     PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
   1434   }
   1435 #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
   1436 
   1437 /* Description : Pack even double word elements of vector pairs
   1438    Arguments   : Inputs  - in0, in1, in2, in3
   1439                  Outputs - out0, out1
   1440                  Return Type - as per RTYPE
   1441    Details     : Even double elements of 'in0' are copied to the left half of
   1442                  'out0' & even double elements of 'in1' are copied to the right
   1443                  half of 'out0'.
   1444 */
   1445 #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)  \
   1446   {                                                      \
   1447     out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \
   1448     out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \
   1449   }
   1450 #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
   1451 #define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
   1452 
   1453 #define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
   1454                  out2, out3)                                                \
   1455   {                                                                         \
   1456     PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
   1457     PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
   1458   }
   1459 #define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
   1460 
   1461 /* Description : Each byte element is logically xor'ed with immediate 128
   1462    Arguments   : Inputs  - in0, in1
   1463                  Outputs - in place operation
   1464                  Return Type - as per RTYPE
   1465    Details     : Each unsigned byte element from input vector 'in0' is
   1466                  logically xor'ed with 128 and the result is stored in-place.
   1467 */
   1468 #define XORI_B2_128(RTYPE, in0, in1)            \
   1469   {                                             \
   1470     in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
   1471     in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
   1472   }
   1473 #define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
   1474 #define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
   1475 
   1476 #define XORI_B3_128(RTYPE, in0, in1, in2)       \
   1477   {                                             \
   1478     XORI_B2_128(RTYPE, in0, in1);               \
   1479     in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \
   1480   }
   1481 #define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
   1482 
   1483 #define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
   1484   {                                            \
   1485     XORI_B2_128(RTYPE, in0, in1);              \
   1486     XORI_B2_128(RTYPE, in2, in3);              \
   1487   }
   1488 #define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
   1489 #define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
   1490 
   1491 #define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
   1492   {                                                           \
   1493     XORI_B4_128(RTYPE, in0, in1, in2, in3);                   \
   1494     XORI_B3_128(RTYPE, in4, in5, in6);                        \
   1495   }
   1496 #define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
   1497 
   1498 /* Description : Average of signed halfword elements -> (a + b) / 2
   1499    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
   1500                  Outputs - out0, out1, out2, out3
   1501                  Return Type - as per RTYPE
   1502    Details     : Each signed halfword element from 'in0' is added to each
   1503                  signed halfword element of 'in1' with full precision resulting
   1504                  in one extra bit in the result. The result is then divided by
   1505                  2 and written to 'out0'
   1506 */
   1507 #define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
   1508                 out2, out3)                                                \
   1509   {                                                                        \
   1510     out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1);                   \
   1511     out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3);                   \
   1512     out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5);                   \
   1513     out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7);                   \
   1514   }
   1515 #define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
   1516 
   1517 /* Description : Addition of signed halfword elements and signed saturation
   1518    Arguments   : Inputs  - in0, in1, in2, in3
   1519                  Outputs - out0, out1
   1520                  Return Type - as per RTYPE
   1521    Details     : Signed halfword elements from 'in0' are added to signed
   1522                  halfword elements of 'in1'. The result is then signed saturated
   1523                  between halfword data type range
   1524 */
   1525 #define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)   \
   1526   {                                                       \
   1527     out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \
   1528     out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \
   1529   }
   1530 #define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
   1531 
   1532 #define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
   1533                  out2, out3)                                                \
   1534   {                                                                         \
   1535     ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
   1536     ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
   1537   }
   1538 #define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
   1539 
   1540 /* Description : Shift left all elements of vector (generic for all data types)
   1541    Arguments   : Inputs  - in0, in1, in2, in3, shift
   1542                  Outputs - in place operation
   1543                  Return Type - as per input vector RTYPE
   1544    Details     : Each element of vector 'in0' is left shifted by 'shift' and
   1545                  the result is written in-place.
   1546 */
   1547 #define SLLI_4V(in0, in1, in2, in3, shift) \
   1548   {                                        \
   1549     in0 = in0 << shift;                    \
   1550     in1 = in1 << shift;                    \
   1551     in2 = in2 << shift;                    \
   1552     in3 = in3 << shift;                    \
   1553   }
   1554 
   1555 /* Description : Arithmetic shift right all elements of vector
   1556                  (generic for all data types)
   1557    Arguments   : Inputs  - in0, in1, in2, in3, shift
   1558                  Outputs - in place operation
   1559                  Return Type - as per input vector RTYPE
   1560    Details     : Each element of vector 'in0' is right shifted by 'shift' and
   1561                  the result is written in-place. 'shift' is a GP variable.
   1562 */
   1563 #define SRA_2V(in0, in1, shift) \
   1564   {                             \
   1565     in0 = in0 >> shift;         \
   1566     in1 = in1 >> shift;         \
   1567   }
   1568 
   1569 #define SRA_4V(in0, in1, in2, in3, shift) \
   1570   {                                       \
   1571     in0 = in0 >> shift;                   \
   1572     in1 = in1 >> shift;                   \
   1573     in2 = in2 >> shift;                   \
   1574     in3 = in3 >> shift;                   \
   1575   }
   1576 
   1577 /* Description : Shift right arithmetic rounded words
   1578    Arguments   : Inputs  - in0, in1, shift
   1579                  Outputs - in place operation
   1580                  Return Type - as per RTYPE
   1581    Details     : Each element of vector 'in0' is shifted right arithmetically by
   1582                  the number of bits in the corresponding element in the vector
   1583                  'shift'. The last discarded bit is added to shifted value for
   1584                  rounding and the result is written in-place.
   1585                  'shift' is a vector.
   1586 */
   1587 #define SRAR_W2(RTYPE, in0, in1, shift)                  \
   1588   {                                                      \
   1589     in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \
   1590     in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \
   1591   }
   1592 
   1593 #define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
   1594   {                                               \
   1595     SRAR_W2(RTYPE, in0, in1, shift)               \
   1596     SRAR_W2(RTYPE, in2, in3, shift)               \
   1597   }
   1598 #define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
   1599 
   1600 /* Description : Shift right arithmetic rounded (immediate)
   1601    Arguments   : Inputs  - in0, in1, shift
   1602                  Outputs - in place operation
   1603                  Return Type - as per RTYPE
   1604    Details     : Each element of vector 'in0' is shifted right arithmetically by
   1605                  the value in 'shift'. The last discarded bit is added to the
   1606                  shifted value for rounding and the result is written in-place.
   1607                  'shift' is an immediate value.
   1608 */
   1609 #define SRARI_H2(RTYPE, in0, in1, shift)           \
   1610   {                                                \
   1611     in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \
   1612     in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \
   1613   }
   1614 #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
   1615 #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
   1616 
   1617 #define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
   1618   {                                                \
   1619     SRARI_H2(RTYPE, in0, in1, shift);              \
   1620     SRARI_H2(RTYPE, in2, in3, shift);              \
   1621   }
   1622 #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
   1623 #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
   1624 
   1625 #define SRARI_W2(RTYPE, in0, in1, shift)           \
   1626   {                                                \
   1627     in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
   1628     in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
   1629   }
   1630 #define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
   1631 
   1632 #define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
   1633   {                                                \
   1634     SRARI_W2(RTYPE, in0, in1, shift);              \
   1635     SRARI_W2(RTYPE, in2, in3, shift);              \
   1636   }
   1637 #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
   1638 
   1639 /* Description : Logical shift right all elements of vector (immediate)
   1640    Arguments   : Inputs  - in0, in1, in2, in3, shift
   1641                  Outputs - out0, out1, out2, out3
   1642                  Return Type - as per RTYPE
   1643    Details     : Each element of vector 'in0' is right shifted by 'shift' and
   1644                  the result is written in-place. 'shift' is an immediate value.
   1645 */
   1646 #define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \
   1647   {                                                                       \
   1648     out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift);                        \
   1649     out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift);                        \
   1650     out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift);                        \
   1651     out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift);                        \
   1652   }
   1653 #define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
   1654 
   1655 /* Description : Multiplication of pairs of vectors
   1656    Arguments   : Inputs  - in0, in1, in2, in3
   1657                  Outputs - out0, out1
   1658    Details     : Each element from 'in0' is multiplied with elements from 'in1'
   1659                  and the result is written to 'out0'
   1660 */
   1661 #define MUL2(in0, in1, in2, in3, out0, out1) \
   1662   {                                          \
   1663     out0 = in0 * in1;                        \
   1664     out1 = in2 * in3;                        \
   1665   }
   1666 #define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
   1667   {                                                                          \
   1668     MUL2(in0, in1, in2, in3, out0, out1);                                    \
   1669     MUL2(in4, in5, in6, in7, out2, out3);                                    \
   1670   }
   1671 
   1672 /* Description : Addition of 2 pairs of vectors
   1673    Arguments   : Inputs  - in0, in1, in2, in3
   1674                  Outputs - out0, out1
   1675    Details     : Each element in 'in0' is added to 'in1' and result is written
   1676                  to 'out0'.
   1677 */
   1678 #define ADD2(in0, in1, in2, in3, out0, out1) \
   1679   {                                          \
   1680     out0 = in0 + in1;                        \
   1681     out1 = in2 + in3;                        \
   1682   }
   1683 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
   1684   {                                                                          \
   1685     ADD2(in0, in1, in2, in3, out0, out1);                                    \
   1686     ADD2(in4, in5, in6, in7, out2, out3);                                    \
   1687   }
   1688 
   1689 /* Description : Subtraction of 2 pairs of vectors
   1690    Arguments   : Inputs  - in0, in1, in2, in3
   1691                  Outputs - out0, out1
   1692    Details     : Each element in 'in1' is subtracted from 'in0' and result is
   1693                  written to 'out0'.
   1694 */
   1695 #define SUB2(in0, in1, in2, in3, out0, out1) \
   1696   {                                          \
   1697     out0 = in0 - in1;                        \
   1698     out1 = in2 - in3;                        \
   1699   }
   1700 #define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
   1701   {                                                                          \
   1702     out0 = in0 - in1;                                                        \
   1703     out1 = in2 - in3;                                                        \
   1704     out2 = in4 - in5;                                                        \
   1705     out3 = in6 - in7;                                                        \
   1706   }
   1707 
   1708 /* Description : Sign extend halfword elements from right half of the vector
   1709    Arguments   : Input  - in    (halfword vector)
   1710                  Output - out   (sign extended word vector)
   1711                  Return Type - signed word
   1712    Details     : Sign bit of halfword elements from input vector 'in' is
   1713                  extracted and interleaved with same vector 'in0' to generate
   1714                  4 word elements keeping sign intact
   1715 */
   1716 #define UNPCK_R_SH_SW(in, out)                    \
   1717   {                                               \
   1718     v8i16 sign_m;                                 \
   1719                                                   \
   1720     sign_m = __msa_clti_s_h((v8i16)in, 0);        \
   1721     out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \
   1722   }
   1723 
   1724 /* Description : Zero extend unsigned byte elements to halfword elements
   1725    Arguments   : Input   - in          (unsigned byte vector)
   1726                  Outputs - out0, out1  (unsigned  halfword vectors)
   1727                  Return Type - signed halfword
   1728    Details     : Zero extended right half of vector is returned in 'out0'
   1729                  Zero extended left half of vector is returned in 'out1'
   1730 */
   1731 #define UNPCK_UB_SH(in, out0, out1)      \
   1732   {                                      \
   1733     v16i8 zero_m = { 0 };                \
   1734                                          \
   1735     ILVRL_B2_SH(zero_m, in, out0, out1); \
   1736   }
   1737 
   1738 /* Description : Sign extend halfword elements from input vector and return
   1739                  the result in pair of vectors
   1740    Arguments   : Input   - in            (halfword vector)
   1741                  Outputs - out0, out1   (sign extended word vectors)
   1742                  Return Type - signed word
   1743    Details     : Sign bit of halfword elements from input vector 'in' is
   1744                  extracted and interleaved right with same vector 'in0' to
   1745                  generate 4 signed word elements in 'out0'
   1746                  Then interleaved left with same vector 'in0' to
   1747                  generate 4 signed word elements in 'out1'
   1748 */
   1749 #define UNPCK_SH_SW(in, out0, out1)       \
   1750   {                                       \
   1751     v8i16 tmp_m;                          \
   1752                                           \
   1753     tmp_m = __msa_clti_s_h((v8i16)in, 0); \
   1754     ILVRL_H2_SW(tmp_m, in, out0, out1);   \
   1755   }
   1756 
   1757 /* Description : Butterfly of 4 input vectors
   1758    Arguments   : Inputs  - in0, in1, in2, in3
   1759                  Outputs - out0, out1, out2, out3
   1760    Details     : Butterfly operation
   1761 */
   1762 #define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
   1763   {                                                             \
   1764     out0 = in0 + in3;                                           \
   1765     out1 = in1 + in2;                                           \
   1766                                                                 \
   1767     out2 = in1 - in2;                                           \
   1768     out3 = in0 - in3;                                           \
   1769   }
   1770 
   1771 /* Description : Butterfly of 8 input vectors
   1772    Arguments   : Inputs  - in0 ...  in7
   1773                  Outputs - out0 .. out7
   1774    Details     : Butterfly operation
   1775 */
   1776 #define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
   1777                     out3, out4, out5, out6, out7)                             \
   1778   {                                                                           \
   1779     out0 = in0 + in7;                                                         \
   1780     out1 = in1 + in6;                                                         \
   1781     out2 = in2 + in5;                                                         \
   1782     out3 = in3 + in4;                                                         \
   1783                                                                               \
   1784     out4 = in3 - in4;                                                         \
   1785     out5 = in2 - in5;                                                         \
   1786     out6 = in1 - in6;                                                         \
   1787     out7 = in0 - in7;                                                         \
   1788   }
   1789 
   1790 /* Description : Butterfly of 16 input vectors
   1791    Arguments   : Inputs  - in0 ...  in15
   1792                  Outputs - out0 .. out15
   1793    Details     : Butterfly operation
   1794 */
   1795 #define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,  \
   1796                      in11, in12, in13, in14, in15, out0, out1, out2, out3,    \
   1797                      out4, out5, out6, out7, out8, out9, out10, out11, out12, \
   1798                      out13, out14, out15)                                     \
   1799   {                                                                           \
   1800     out0 = in0 + in15;                                                        \
   1801     out1 = in1 + in14;                                                        \
   1802     out2 = in2 + in13;                                                        \
   1803     out3 = in3 + in12;                                                        \
   1804     out4 = in4 + in11;                                                        \
   1805     out5 = in5 + in10;                                                        \
   1806     out6 = in6 + in9;                                                         \
   1807     out7 = in7 + in8;                                                         \
   1808                                                                               \
   1809     out8 = in7 - in8;                                                         \
   1810     out9 = in6 - in9;                                                         \
   1811     out10 = in5 - in10;                                                       \
   1812     out11 = in4 - in11;                                                       \
   1813     out12 = in3 - in12;                                                       \
   1814     out13 = in2 - in13;                                                       \
   1815     out14 = in1 - in14;                                                       \
   1816     out15 = in0 - in15;                                                       \
   1817   }
   1818 
   1819 /* Description : Transpose input 8x8 byte block
   1820    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
   1821                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
   1822                  Return Type - as per RTYPE
   1823 */
   1824 #define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0,   \
   1825                         out1, out2, out3, out4, out5, out6, out7)              \
   1826   {                                                                            \
   1827     v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
   1828     v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                      \
   1829                                                                                \
   1830     ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \
   1831                tmp3_m);                                                        \
   1832     ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                               \
   1833     ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                               \
   1834     ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                               \
   1835     ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                               \
   1836     SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                               \
   1837     SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                               \
   1838   }
   1839 #define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
   1840 
   1841 /* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
   1842    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
   1843                            in8, in9, in10, in11, in12, in13, in14, in15
   1844                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
   1845                  Return Type - unsigned byte
   1846 */
   1847 #define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
   1848                             in10, in11, in12, in13, in14, in15, out0, out1,   \
   1849                             out2, out3, out4, out5, out6, out7)               \
   1850   {                                                                           \
   1851     v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
   1852     v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                     \
   1853                                                                               \
   1854     ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                              \
   1855     ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                            \
   1856     ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                            \
   1857     ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                            \
   1858                                                                               \
   1859     tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                  \
   1860     tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                  \
   1861     tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                  \
   1862     tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                  \
   1863     out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                    \
   1864     tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                  \
   1865     out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                    \
   1866     tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                  \
   1867                                                                               \
   1868     ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                  \
   1869     out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
   1870     out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
   1871                                                                               \
   1872     tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);              \
   1873     tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                  \
   1874     out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
   1875     out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
   1876                                                                               \
   1877     ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);              \
   1878     out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
   1879     out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
   1880                                                                               \
   1881     tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);              \
   1882     tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);              \
   1883     tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);              \
   1884     tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);              \
   1885     out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
   1886     out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
   1887   }
   1888 
   1889 /* Description : Transpose 4x4 block with half word elements in vectors
   1890    Arguments   : Inputs  - in0, in1, in2, in3
   1891                  Outputs - out0, out1, out2, out3
   1892                  Return Type - signed halfword
   1893 */
   1894 #define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
   1895   {                                                                    \
   1896     v8i16 s0_m, s1_m;                                                  \
   1897                                                                        \
   1898     ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                        \
   1899     ILVRL_W2_SH(s1_m, s0_m, out0, out2);                               \
   1900     out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);              \
   1901     out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);              \
   1902   }
   1903 
   1904 /* Description : Transpose 4x8 block with half word elements in vectors
   1905    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
   1906                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
   1907                  Return Type - signed halfword
   1908 */
   1909 #define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
   1910                            out2, out3, out4, out5, out6, out7)                 \
   1911   {                                                                            \
   1912     v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
   1913     v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                      \
   1914     v8i16 zero_m = { 0 };                                                      \
   1915                                                                                \
   1916     ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \
   1917                tmp3_n);                                                        \
   1918     ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m);                               \
   1919     ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m);                               \
   1920                                                                                \
   1921     out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m);                  \
   1922     out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m);                  \
   1923     out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m);                  \
   1924     out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m);                  \
   1925                                                                                \
   1926     out4 = zero_m;                                                             \
   1927     out5 = zero_m;                                                             \
   1928     out6 = zero_m;                                                             \
   1929     out7 = zero_m;                                                             \
   1930   }
   1931 
   1932 /* Description : Transpose 8x4 block with half word elements in vectors
   1933    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
   1934                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
   1935                  Return Type - signed halfword
   1936 */
   1937 #define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
   1938   {                                                                    \
   1939     v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                              \
   1940                                                                        \
   1941     ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m);                    \
   1942     ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m);                    \
   1943     ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2);            \
   1944     ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3);            \
   1945   }
   1946 
   1947 /* Description : Transpose 8x8 block with half word elements in vectors
   1948    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
   1949                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
   1950                  Return Type - as per RTYPE
   1951 */
   1952 #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
   1953                        out1, out2, out3, out4, out5, out6, out7)            \
   1954   {                                                                         \
   1955     v8i16 s0_m, s1_m;                                                       \
   1956     v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
   1957     v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                   \
   1958                                                                             \
   1959     ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
   1960     ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                                \
   1961     ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
   1962     ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                                \
   1963     ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
   1964     ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                                \
   1965     ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
   1966     ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                                \
   1967     PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \
   1968              tmp7_m, out0, out2, out4, out6);                               \
   1969     out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m);              \
   1970     out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m);              \
   1971     out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m);              \
   1972     out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m);              \
   1973   }
   1974 #define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
   1975 
   1976 /* Description : Transpose 4x4 block with word elements in vectors
   1977    Arguments   : Inputs  - in0, in1, in2, in3
   1978                  Outputs - out0, out1, out2, out3
   1979                  Return Type - signed word
   1980 */
   1981 #define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
   1982   {                                                                    \
   1983     v4i32 s0_m, s1_m, s2_m, s3_m;                                      \
   1984                                                                        \
   1985     ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                 \
   1986     ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                 \
   1987                                                                        \
   1988     out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m);              \
   1989     out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);              \
   1990     out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);              \
   1991     out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);              \
   1992   }
   1993 
   1994 /* Description : Add block 4x4
   1995    Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   1996    Details     : Least significant 4 bytes from each input vector are added to
   1997                  the destination bytes, clipped between 0-255 and stored.
   1998 */
   1999 #define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)        \
   2000   {                                                              \
   2001     uint32_t src0_m, src1_m, src2_m, src3_m;                     \
   2002     v8i16 inp0_m, inp1_m, res0_m, res1_m;                        \
   2003     v16i8 dst0_m = { 0 };                                        \
   2004     v16i8 dst1_m = { 0 };                                        \
   2005     v16i8 zero_m = { 0 };                                        \
   2006                                                                  \
   2007     ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m)               \
   2008     LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);           \
   2009     INSERT_W2_SB(src0_m, src1_m, dst0_m);                        \
   2010     INSERT_W2_SB(src2_m, src3_m, dst1_m);                        \
   2011     ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);  \
   2012     ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);        \
   2013     CLIP_SH2_0_255(res0_m, res1_m);                              \
   2014     PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
   2015     ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride);          \
   2016   }
   2017 
   2018 /* Description : Pack even elements of input vectors & xor with 128
   2019    Arguments   : Inputs - in0, in1
   2020                  Output - out_m
   2021                  Return Type - unsigned byte
   2022    Details     : Signed byte even elements from 'in0' and 'in1' are packed
   2023                  together in one vector and the resulting vector is xor'ed with
   2024                  128 to shift the range from signed to unsigned byte
   2025 */
   2026 #define PCKEV_XORI128_UB(in0, in1)                        \
   2027   ({                                                      \
   2028     v16u8 out_m;                                          \
   2029                                                           \
   2030     out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \
   2031     out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128);       \
   2032     out_m;                                                \
   2033   })
   2034 
   2035 /* Description : Converts inputs to unsigned bytes, interleave, average & store
   2036                  as 8x4 unsigned byte block
   2037    Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
   2038                           pdst, stride
   2039 */
   2040 #define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \
   2041                                 pdst, stride)                               \
   2042   {                                                                         \
   2043     v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
   2044                                                                             \
   2045     tmp0_m = PCKEV_XORI128_UB(in0, in1);                                    \
   2046     tmp1_m = PCKEV_XORI128_UB(in2, in3);                                    \
   2047     ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                     \
   2048     AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);            \
   2049     ST8x4_UB(tmp0_m, tmp1_m, pdst, stride);                                 \
   2050   }
   2051 
   2052 /* Description : Pack even byte elements and store byte vector in destination
   2053                  memory
   2054    Arguments   : Inputs - in0, in1, pdst
   2055 */
   2056 #define PCKEV_ST_SB(in0, in1, pdst)                \
   2057   {                                                \
   2058     v16i8 tmp_m;                                   \
   2059                                                    \
   2060     tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \
   2061     ST_SB(tmp_m, (pdst));                          \
   2062   }
   2063 
   2064 /* Description : Horizontal 2 tap filter kernel code
   2065    Arguments   : Inputs - in0, in1, mask, coeff, shift
   2066 */
   2067 #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)        \
   2068   ({                                                            \
   2069     v16i8 tmp0_m;                                               \
   2070     v8u16 tmp1_m;                                               \
   2071                                                                 \
   2072     tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \
   2073     tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff);       \
   2074     tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift);        \
   2075                                                                 \
   2076     tmp1_m;                                                     \
   2077   })
   2078 #endif /* VPX_DSP_MIPS_MACROS_MSA_H_ */
   2079