Home | History | Annotate | Download | only in mips
      1 /*
      2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #ifndef VPX_DSP_MIPS_MACROS_MSA_H_
     12 #define VPX_DSP_MIPS_MACROS_MSA_H_
     13 
     14 #include <msa.h>
     15 
     16 #include "./vpx_config.h"
     17 #include "vpx/vpx_integer.h"
     18 
     19 #define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
     20 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
     21 #define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
     22 
     23 #define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc))
     24 #define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
     25 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
     26 
     27 #define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc))
     28 #define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
     29 
     30 #define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
     31 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
     32 #define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
     33 
     34 #define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
     35 #define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
     36 
     37 #define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
     38 #define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
     39 
     40 #if (__mips_isa_rev >= 6)
     41 #define LH(psrc) ({                                 \
     42   const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
     43   uint16_t val_m;                                   \
     44                                                     \
     45   __asm__ __volatile__ (                            \
     46       "lh  %[val_m],  %[psrc_m]  \n\t"              \
     47                                                     \
     48       : [val_m] "=r" (val_m)                        \
     49       : [psrc_m] "m" (*psrc_m)                      \
     50   );                                                \
     51                                                     \
     52   val_m;                                            \
     53 })
     54 
     55 #define LW(psrc) ({                                 \
     56   const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
     57   uint32_t val_m;                                   \
     58                                                     \
     59   __asm__ __volatile__ (                            \
     60       "lw  %[val_m],  %[psrc_m]  \n\t"              \
     61                                                     \
     62       : [val_m] "=r" (val_m)                        \
     63       : [psrc_m] "m" (*psrc_m)                      \
     64   );                                                \
     65                                                     \
     66   val_m;                                            \
     67 })
     68 
     69 #if (__mips == 64)
     70 #define LD(psrc) ({                                 \
     71   const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
     72   uint64_t val_m = 0;                               \
     73                                                     \
     74   __asm__ __volatile__ (                            \
     75       "ld  %[val_m],  %[psrc_m]  \n\t"              \
     76                                                     \
     77       : [val_m] "=r" (val_m)                        \
     78       : [psrc_m] "m" (*psrc_m)                      \
     79   );                                                \
     80                                                     \
     81   val_m;                                            \
     82 })
     83 #else  // !(__mips == 64)
     84 #define LD(psrc) ({                                        \
     85   const uint8_t *psrc_m = (const uint8_t *)(psrc);         \
     86   uint32_t val0_m, val1_m;                                 \
     87   uint64_t val_m = 0;                                      \
     88                                                            \
     89   val0_m = LW(psrc_m);                                     \
     90   val1_m = LW(psrc_m + 4);                                 \
     91                                                            \
     92   val_m = (uint64_t)(val1_m);                              \
     93   val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000);  \
     94   val_m = (uint64_t)(val_m | (uint64_t)val0_m);            \
     95                                                            \
     96   val_m;                                                   \
     97 })
     98 #endif  // (__mips == 64)
     99 
    100 #define SH(val, pdst) {                 \
    101   uint8_t *pdst_m = (uint8_t *)(pdst);  \
    102   const uint16_t val_m = (val);         \
    103                                         \
    104   __asm__ __volatile__ (                \
    105       "sh  %[val_m],  %[pdst_m]  \n\t"  \
    106                                         \
    107       : [pdst_m] "=m" (*pdst_m)         \
    108       : [val_m] "r" (val_m)             \
    109   );                                    \
    110 }
    111 
    112 #define SW(val, pdst) {                 \
    113   uint8_t *pdst_m = (uint8_t *)(pdst);  \
    114   const uint32_t val_m = (val);         \
    115                                         \
    116   __asm__ __volatile__ (                \
    117       "sw  %[val_m],  %[pdst_m]  \n\t"  \
    118                                         \
    119       : [pdst_m] "=m" (*pdst_m)         \
    120       : [val_m] "r" (val_m)             \
    121   );                                    \
    122 }
    123 
    124 #define SD(val, pdst) {                 \
    125   uint8_t *pdst_m = (uint8_t *)(pdst);  \
    126   const uint64_t val_m = (val);         \
    127                                         \
    128   __asm__ __volatile__ (                \
    129       "sd  %[val_m],  %[pdst_m]  \n\t"  \
    130                                         \
    131       : [pdst_m] "=m" (*pdst_m)         \
    132       : [val_m] "r" (val_m)             \
    133   );                                    \
    134 }
    135 #else  // !(__mips_isa_rev >= 6)
    136 #define LH(psrc) ({                                 \
    137   const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
    138   uint16_t val_m;                                   \
    139                                                     \
    140   __asm__ __volatile__ (                            \
    141       "ulh  %[val_m],  %[psrc_m]  \n\t"             \
    142                                                     \
    143       : [val_m] "=r" (val_m)                        \
    144       : [psrc_m] "m" (*psrc_m)                      \
    145   );                                                \
    146                                                     \
    147   val_m;                                            \
    148 })
    149 
    150 #define LW(psrc) ({                                 \
    151   const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
    152   uint32_t val_m;                                   \
    153                                                     \
    154   __asm__ __volatile__ (                            \
    155       "ulw  %[val_m],  %[psrc_m]  \n\t"             \
    156                                                     \
    157       : [val_m] "=r" (val_m)                        \
    158       : [psrc_m] "m" (*psrc_m)                      \
    159   );                                                \
    160                                                     \
    161   val_m;                                            \
    162 })
    163 
    164 #if (__mips == 64)
    165 #define LD(psrc) ({                                 \
    166   const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
    167   uint64_t val_m = 0;                               \
    168                                                     \
    169   __asm__ __volatile__ (                            \
    170       "uld  %[val_m],  %[psrc_m]  \n\t"             \
    171                                                     \
    172       : [val_m] "=r" (val_m)                        \
    173       : [psrc_m] "m" (*psrc_m)                      \
    174   );                                                \
    175                                                     \
    176   val_m;                                            \
    177 })
    178 #else  // !(__mips == 64)
    179 #define LD(psrc) ({                                        \
    180   const uint8_t *psrc_m1 = (const uint8_t *)(psrc);        \
    181   uint32_t val0_m, val1_m;                                 \
    182   uint64_t val_m = 0;                                      \
    183                                                            \
    184   val0_m = LW(psrc_m1);                                    \
    185   val1_m = LW(psrc_m1 + 4);                                \
    186                                                            \
    187   val_m = (uint64_t)(val1_m);                              \
    188   val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000);  \
    189   val_m = (uint64_t)(val_m | (uint64_t)val0_m);            \
    190                                                            \
    191   val_m;                                                   \
    192 })
    193 #endif  // (__mips == 64)
    194 
    195 #define SH(val, pdst) {                  \
    196   uint8_t *pdst_m = (uint8_t *)(pdst);   \
    197   const uint16_t val_m = (val);          \
    198                                          \
    199   __asm__ __volatile__ (                 \
    200       "ush  %[val_m],  %[pdst_m]  \n\t"  \
    201                                          \
    202       : [pdst_m] "=m" (*pdst_m)          \
    203       : [val_m] "r" (val_m)              \
    204   );                                     \
    205 }
    206 
    207 #define SW(val, pdst) {                  \
    208   uint8_t *pdst_m = (uint8_t *)(pdst);   \
    209   const uint32_t val_m = (val);          \
    210                                          \
    211   __asm__ __volatile__ (                 \
    212       "usw  %[val_m],  %[pdst_m]  \n\t"  \
    213                                          \
    214       : [pdst_m] "=m" (*pdst_m)          \
    215       : [val_m] "r" (val_m)              \
    216   );                                     \
    217 }
    218 
    219 #define SD(val, pdst) {                                     \
    220   uint8_t *pdst_m1 = (uint8_t *)(pdst);                     \
    221   uint32_t val0_m, val1_m;                                  \
    222                                                             \
    223   val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF);          \
    224   val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF);  \
    225                                                             \
    226   SW(val0_m, pdst_m1);                                      \
    227   SW(val1_m, pdst_m1 + 4);                                  \
    228 }
    229 #endif  // (__mips_isa_rev >= 6)
    230 
    231 /* Description : Load 4 words with stride
    232    Arguments   : Inputs  - psrc, stride
    233                  Outputs - out0, out1, out2, out3
    234    Details     : Load word in 'out0' from (psrc)
    235                  Load word in 'out1' from (psrc + stride)
    236                  Load word in 'out2' from (psrc + 2 * stride)
    237                  Load word in 'out3' from (psrc + 3 * stride)
    238 */
    239 #define LW4(psrc, stride, out0, out1, out2, out3) {  \
    240   out0 = LW((psrc));                                 \
    241   out1 = LW((psrc) + stride);                        \
    242   out2 = LW((psrc) + 2 * stride);                    \
    243   out3 = LW((psrc) + 3 * stride);                    \
    244 }
    245 
    246 /* Description : Load double words with stride
    247    Arguments   : Inputs  - psrc, stride
    248                  Outputs - out0, out1
    249    Details     : Load double word in 'out0' from (psrc)
    250                  Load double word in 'out1' from (psrc + stride)
    251 */
    252 #define LD2(psrc, stride, out0, out1) {  \
    253   out0 = LD((psrc));                     \
    254   out1 = LD((psrc) + stride);            \
    255 }
    256 #define LD4(psrc, stride, out0, out1, out2, out3) {  \
    257   LD2((psrc), stride, out0, out1);                   \
    258   LD2((psrc) + 2 * stride, stride, out2, out3);      \
    259 }
    260 
    261 /* Description : Store 4 words with stride
    262    Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
    263    Details     : Store word from 'in0' to (pdst)
    264                  Store word from 'in1' to (pdst + stride)
    265                  Store word from 'in2' to (pdst + 2 * stride)
    266                  Store word from 'in3' to (pdst + 3 * stride)
    267 */
    268 #define SW4(in0, in1, in2, in3, pdst, stride) {  \
    269   SW(in0, (pdst))                                \
    270   SW(in1, (pdst) + stride);                      \
    271   SW(in2, (pdst) + 2 * stride);                  \
    272   SW(in3, (pdst) + 3 * stride);                  \
    273 }
    274 
    275 /* Description : Store 4 double words with stride
    276    Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
    277    Details     : Store double word from 'in0' to (pdst)
    278                  Store double word from 'in1' to (pdst + stride)
    279                  Store double word from 'in2' to (pdst + 2 * stride)
    280                  Store double word from 'in3' to (pdst + 3 * stride)
    281 */
    282 #define SD4(in0, in1, in2, in3, pdst, stride) {  \
    283   SD(in0, (pdst))                                \
    284   SD(in1, (pdst) + stride);                      \
    285   SD(in2, (pdst) + 2 * stride);                  \
    286   SD(in3, (pdst) + 3 * stride);                  \
    287 }
    288 
    289 /* Description : Load vectors with 16 byte elements with stride
    290    Arguments   : Inputs  - psrc, stride
    291                  Outputs - out0, out1
    292                  Return Type - as per RTYPE
    293    Details     : Load 16 byte elements in 'out0' from (psrc)
    294                  Load 16 byte elements in 'out1' from (psrc + stride)
    295 */
    296 #define LD_B2(RTYPE, psrc, stride, out0, out1) {  \
    297   out0 = LD_B(RTYPE, (psrc));                     \
    298   out1 = LD_B(RTYPE, (psrc) + stride);            \
    299 }
    300 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
    301 #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
    302 
    303 #define LD_B3(RTYPE, psrc, stride, out0, out1, out2) {  \
    304   LD_B2(RTYPE, (psrc), stride, out0, out1);             \
    305   out2 = LD_B(RTYPE, (psrc) + 2 * stride);              \
    306 }
    307 #define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
    308 
    309 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) {  \
    310   LD_B2(RTYPE, (psrc), stride, out0, out1);                   \
    311   LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3);     \
    312 }
    313 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
    314 #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
    315 
    316 #define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) {  \
    317   LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);             \
    318   out4 = LD_B(RTYPE, (psrc) + 4 * stride);                          \
    319 }
    320 #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
    321 #define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
    322 
    323 #define LD_B7(RTYPE, psrc, stride,                             \
    324               out0, out1, out2, out3, out4, out5, out6) {      \
    325   LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
    326   LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);       \
    327 }
    328 #define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
    329 
    330 #define LD_B8(RTYPE, psrc, stride,                                    \
    331               out0, out1, out2, out3, out4, out5, out6, out7) {       \
    332   LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    333   LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
    334 }
    335 #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
    336 #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
    337 
    338 /* Description : Load vectors with 8 halfword elements with stride
    339    Arguments   : Inputs  - psrc, stride
    340                  Outputs - out0, out1
    341    Details     : Load 8 halfword elements in 'out0' from (psrc)
    342                  Load 8 halfword elements in 'out1' from (psrc + stride)
    343 */
    344 #define LD_H2(RTYPE, psrc, stride, out0, out1) {  \
    345   out0 = LD_H(RTYPE, (psrc));                     \
    346   out1 = LD_H(RTYPE, (psrc) + (stride));          \
    347 }
    348 #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
    349 
    350 #define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) {  \
    351   LD_H2(RTYPE, (psrc), stride, out0, out1);                   \
    352   LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);      \
    353 }
    354 #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
    355 
    356 #define LD_H8(RTYPE, psrc, stride,                                    \
    357               out0, out1, out2, out3, out4, out5, out6, out7) {       \
    358   LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    359   LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
    360 }
    361 #define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
    362 
    363 #define LD_H16(RTYPE, psrc, stride,                                     \
    364                out0, out1, out2, out3, out4, out5, out6, out7,          \
    365                out8, out9, out10, out11, out12, out13, out14, out15) {  \
    366   LD_H8(RTYPE, (psrc), stride,                                          \
    367         out0, out1, out2, out3, out4, out5, out6, out7);                \
    368   LD_H8(RTYPE, (psrc) + 8 * stride, stride,                             \
    369         out8, out9, out10, out11, out12, out13, out14, out15);          \
    370 }
    371 #define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
    372 
    373 /* Description : Load 4x4 block of signed halfword elements from 1D source
    374                  data into 4 vectors (Each vector with 4 signed halfwords)
    375    Arguments   : Input   - psrc
    376                  Outputs - out0, out1, out2, out3
    377 */
    378 #define LD4x4_SH(psrc, out0, out1, out2, out3) {         \
    379   out0 = LD_SH(psrc);                                    \
    380   out2 = LD_SH(psrc + 8);                                \
    381   out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);  \
    382   out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2);  \
    383 }
    384 
    385 /* Description : Load 2 vectors of signed word elements with stride
    386    Arguments   : Inputs  - psrc, stride
    387                  Outputs - out0, out1
    388                  Return Type - signed word
    389 */
    390 #define LD_SW2(psrc, stride, out0, out1) {  \
    391   out0 = LD_SW((psrc));                     \
    392   out1 = LD_SW((psrc) + stride);            \
    393 }
    394 
    395 /* Description : Store vectors of 16 byte elements with stride
    396    Arguments   : Inputs - in0, in1, pdst, stride
    397    Details     : Store 16 byte elements from 'in0' to (pdst)
    398                  Store 16 byte elements from 'in1' to (pdst + stride)
    399 */
    400 #define ST_B2(RTYPE, in0, in1, pdst, stride) {  \
    401   ST_B(RTYPE, in0, (pdst));                     \
    402   ST_B(RTYPE, in1, (pdst) + stride);            \
    403 }
    404 #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
    405 
    406 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) {  \
    407   ST_B2(RTYPE, in0, in1, (pdst), stride);                 \
    408   ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);    \
    409 }
    410 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
    411 
    412 #define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,      \
    413               pdst, stride) {                                     \
    414   ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);                 \
    415   ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);  \
    416 }
    417 #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
    418 
    419 /* Description : Store vectors of 8 halfword elements with stride
    420    Arguments   : Inputs - in0, in1, pdst, stride
    421    Details     : Store 8 halfword elements from 'in0' to (pdst)
    422                  Store 8 halfword elements from 'in1' to (pdst + stride)
    423 */
    424 #define ST_H2(RTYPE, in0, in1, pdst, stride) {  \
    425   ST_H(RTYPE, in0, (pdst));                     \
    426   ST_H(RTYPE, in1, (pdst) + stride);            \
    427 }
    428 #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
    429 
    430 #define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) {  \
    431   ST_H2(RTYPE, in0, in1, (pdst), stride);                 \
    432   ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);    \
    433 }
    434 #define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
    435 
    436 #define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) {  \
    437   ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);                           \
    438   ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);              \
    439 }
    440 #define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
    441 
    442 /* Description : Store vectors of word elements with stride
    443    Arguments   : Inputs - in0, in1, pdst, stride
    444    Details     : Store 4 word elements from 'in0' to (pdst)
    445                  Store 4 word elements from 'in1' to (pdst + stride)
    446 */
    447 #define ST_SW2(in0, in1, pdst, stride) {  \
    448   ST_SW(in0, (pdst));                     \
    449   ST_SW(in1, (pdst) + stride);            \
    450 }
    451 
    452 /* Description : Store 2x4 byte block to destination memory from input vector
    453    Arguments   : Inputs - in, stidx, pdst, stride
    454    Details     : Index 'stidx' halfword element from 'in' vector is copied to
    455                  the GP register and stored to (pdst)
    456                  Index 'stidx+1' halfword element from 'in' vector is copied to
    457                  the GP register and stored to (pdst + stride)
    458                  Index 'stidx+2' halfword element from 'in' vector is copied to
    459                  the GP register and stored to (pdst + 2 * stride)
    460                  Index 'stidx+3' halfword element from 'in' vector is copied to
    461                  the GP register and stored to (pdst + 3 * stride)
    462 */
    463 #define ST2x4_UB(in, stidx, pdst, stride) {         \
    464   uint16_t out0_m, out1_m, out2_m, out3_m;          \
    465   uint8_t *pblk_2x4_m = (uint8_t *)(pdst);          \
    466                                                     \
    467   out0_m = __msa_copy_u_h((v8i16)in, (stidx));      \
    468   out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1));  \
    469   out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2));  \
    470   out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3));  \
    471                                                     \
    472   SH(out0_m, pblk_2x4_m);                           \
    473   SH(out1_m, pblk_2x4_m + stride);                  \
    474   SH(out2_m, pblk_2x4_m + 2 * stride);              \
    475   SH(out3_m, pblk_2x4_m + 3 * stride);              \
    476 }
    477 
    478 /* Description : Store 4x2 byte block to destination memory from input vector
    479    Arguments   : Inputs - in, pdst, stride
    480    Details     : Index 0 word element from 'in' vector is copied to the GP
    481                  register and stored to (pdst)
    482                  Index 1 word element from 'in' vector is copied to the GP
    483                  register and stored to (pdst + stride)
    484 */
    485 #define ST4x2_UB(in, pdst, stride) {        \
    486   uint32_t out0_m, out1_m;                  \
    487   uint8_t *pblk_4x2_m = (uint8_t *)(pdst);  \
    488                                             \
    489   out0_m = __msa_copy_u_w((v4i32)in, 0);    \
    490   out1_m = __msa_copy_u_w((v4i32)in, 1);    \
    491                                             \
    492   SW(out0_m, pblk_4x2_m);                   \
    493   SW(out1_m, pblk_4x2_m + stride);          \
    494 }
    495 
/* Description : Store 4x4 byte block to destination memory from input vectors
   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
   Details     : 'idx0' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst)
                 'idx1' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst + stride)
                 'idx2' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 'idx3' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 3 * stride)
*/
    507 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) {  \
    508   uint32_t out0_m, out1_m, out2_m, out3_m;                          \
    509   uint8_t *pblk_4x4_m = (uint8_t *)(pdst);                          \
    510                                                                     \
    511   out0_m = __msa_copy_u_w((v4i32)in0, idx0);                        \
    512   out1_m = __msa_copy_u_w((v4i32)in0, idx1);                        \
    513   out2_m = __msa_copy_u_w((v4i32)in1, idx2);                        \
    514   out3_m = __msa_copy_u_w((v4i32)in1, idx3);                        \
    515                                                                     \
    516   SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);          \
    517 }
    518 #define ST4x8_UB(in0, in1, pdst, stride) {                        \
    519   uint8_t *pblk_4x8 = (uint8_t *)(pdst);                          \
    520                                                                   \
    521   ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);               \
    522   ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
    523 }
    524 
    525 /* Description : Store 8x1 byte block to destination memory from input vector
    526    Arguments   : Inputs - in, pdst
    527    Details     : Index 0 double word element from 'in' vector is copied to the
    528                  GP register and stored to (pdst)
    529 */
    530 #define ST8x1_UB(in, pdst) {              \
    531   uint64_t out0_m;                        \
    532                                           \
    533   out0_m = __msa_copy_u_d((v2i64)in, 0);  \
    534   SD(out0_m, pdst);                       \
    535 }
    536 
    537 /* Description : Store 8x2 byte block to destination memory from input vector
    538    Arguments   : Inputs - in, pdst, stride
    539    Details     : Index 0 double word element from 'in' vector is copied to the
    540                  GP register and stored to (pdst)
    541                  Index 1 double word element from 'in' vector is copied to the
    542                  GP register and stored to (pdst + stride)
    543 */
    544 #define ST8x2_UB(in, pdst, stride) {        \
    545   uint64_t out0_m, out1_m;                  \
    546   uint8_t *pblk_8x2_m = (uint8_t *)(pdst);  \
    547                                             \
    548   out0_m = __msa_copy_u_d((v2i64)in, 0);    \
    549   out1_m = __msa_copy_u_d((v2i64)in, 1);    \
    550                                             \
    551   SD(out0_m, pblk_8x2_m);                   \
    552   SD(out1_m, pblk_8x2_m + stride);          \
    553 }
    554 
    555 /* Description : Store 8x4 byte block to destination memory from input
    556                  vectors
    557    Arguments   : Inputs - in0, in1, pdst, stride
    558    Details     : Index 0 double word element from 'in0' vector is copied to the
    559                  GP register and stored to (pdst)
    560                  Index 1 double word element from 'in0' vector is copied to the
    561                  GP register and stored to (pdst + stride)
    562                  Index 0 double word element from 'in1' vector is copied to the
    563                  GP register and stored to (pdst + 2 * stride)
    564                  Index 1 double word element from 'in1' vector is copied to the
    565                  GP register and stored to (pdst + 3 * stride)
    566 */
    567 #define ST8x4_UB(in0, in1, pdst, stride) {                  \
    568   uint64_t out0_m, out1_m, out2_m, out3_m;                  \
    569   uint8_t *pblk_8x4_m = (uint8_t *)(pdst);                  \
    570                                                             \
    571   out0_m = __msa_copy_u_d((v2i64)in0, 0);                   \
    572   out1_m = __msa_copy_u_d((v2i64)in0, 1);                   \
    573   out2_m = __msa_copy_u_d((v2i64)in1, 0);                   \
    574   out3_m = __msa_copy_u_d((v2i64)in1, 1);                   \
    575                                                             \
    576   SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride);  \
    577 }
    578 
    579 /* Description : average with rounding (in0 + in1 + 1) / 2.
    580    Arguments   : Inputs  - in0, in1, in2, in3,
    581                  Outputs - out0, out1
    582                  Return Type - as per RTYPE
    583    Details     : Each unsigned byte element from 'in0' vector is added with
    584                  each unsigned byte element from 'in1' vector. Then the average
    585                  with rounding is calculated and written to 'out0'
    586 */
    587 #define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
    588   out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1);    \
    589   out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3);    \
    590 }
    591 #define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
    592 
    593 #define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
    594                  out0, out1, out2, out3) {                       \
    595   AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)                \
    596   AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3)                \
    597 }
    598 #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
    599 
    600 /* Description : Immediate number of elements to slide with zero
    601    Arguments   : Inputs  - in0, in1, slide_val
    602                  Outputs - out0, out1
    603                  Return Type - as per RTYPE
    604    Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
    605                  value specified in the 'slide_val'
    606 */
    607 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) {          \
    608   v16i8 zero_m = { 0 };                                              \
    609   out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val);  \
    610   out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val);  \
    611 }
    612 #define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
    613 
    614 #define SLDI_B4_0(RTYPE, in0, in1, in2, in3,            \
    615                   out0, out1, out2, out3, slide_val) {  \
    616   SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);    \
    617   SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);    \
    618 }
    619 #define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
    620 
    621 /* Description : Immediate number of elements to slide
    622    Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
    623                  Outputs - out0, out1
    624                  Return Type - as per RTYPE
    625    Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
    626                  value specified in the 'slide_val'
    627 */
    628 #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) {  \
    629   out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val);         \
    630   out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val);         \
    631 }
    632 #define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
    633 #define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
    634 
    635 #define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2,      \
    636                 out0, out1, out2, slide_val) {                        \
    637   SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)   \
    638   out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val);  \
    639 }
    640 #define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
    641 #define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
    642 
    643 /* Description : Shuffle byte vector elements as per mask vector
    644    Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
    645                  Outputs - out0, out1
    646                  Return Type - as per RTYPE
    647    Details     : Byte elements from 'in0' & 'in1' are copied selectively to
    648                  'out0' as per control vector 'mask0'
    649 */
    650 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) {  \
    651   out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0);     \
    652   out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2);     \
    653 }
    654 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
    655 #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
    656 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
    657 
    658 #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,     \
    659                 out0, out1, out2, out3) {                        \
    660   VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
    661   VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
    662 }
    663 #define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
    664 #define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
    665 
    666 /* Description : Dot product of byte vector elements
    667    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
    668                  Outputs - out0, out1
    669                  Return Type - as per RTYPE
    670    Details     : Unsigned byte elements from 'mult0' are multiplied with
    671                  unsigned byte elements from 'cnst0' producing a result
    672                  twice the size of input i.e. unsigned halfword.
    673                  The multiplication result of adjacent odd-even elements
    674                  are added together and written to the 'out0' vector
    675 */
    676 #define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
    677   out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0);        \
    678   out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1);        \
    679 }
    680 #define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
    681 
    682 #define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,         \
    683                  cnst0, cnst1, cnst2, cnst3,                \
    684                  out0, out1, out2, out3) {                  \
    685   DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    686   DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
    687 }
    688 #define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
    689 
    690 /* Description : Dot product of byte vector elements
    691    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
    692                  Outputs - out0, out1
    693                  Return Type - as per RTYPE
    694    Details     : Signed byte elements from 'mult0' are multiplied with
    695                  signed byte elements from 'cnst0' producing a result
    696                  twice the size of input i.e. signed halfword.
    697                  The multiplication result of adjacent odd-even elements
    698                  are added together and written to the 'out0' vector
    699 */
    700 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
    701   out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0);        \
    702   out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1);        \
    703 }
    704 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
    705 
    706 #define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                     \
    707                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) {  \
    708   DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);              \
    709   DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);              \
    710 }
    711 #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
    712 
    713 /* Description : Dot product of halfword vector elements
    714    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
    715                  Outputs - out0, out1
    716                  Return Type - as per RTYPE
    717    Details     : Signed halfword elements from 'mult0' are multiplied with
    718                  signed halfword elements from 'cnst0' producing a result
    719                  twice the size of input i.e. signed word.
    720                  The multiplication result of adjacent odd-even elements
    721                  are added together and written to the 'out0' vector
    722 */
    723 #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
    724   out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0);        \
    725   out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1);        \
    726 }
    727 #define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
    728 
    729 #define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,         \
    730                  cnst0, cnst1, cnst2, cnst3,                \
    731                  out0, out1, out2, out3) {                  \
    732   DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    733   DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
    734 }
    735 #define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
    736 
    737 /* Description : Dot product of word vector elements
    738    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
    739                  Outputs - out0, out1
    740                  Return Type - as per RTYPE
    741    Details     : Signed word elements from 'mult0' are multiplied with
    742                  signed word elements from 'cnst0' producing a result
    743                  twice the size of input i.e. signed double word.
    744                  The multiplication result of adjacent odd-even elements
    745                  are added together and written to the 'out0' vector
    746 */
    747 #define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
    748   out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0);        \
    749   out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1);        \
    750 }
    751 #define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
    752 
    753 /* Description : Dot product & addition of byte vector elements
    754    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
    755                  Outputs - out0, out1
    756                  Return Type - as per RTYPE
    757    Details     : Signed byte elements from 'mult0' are multiplied with
    758                  signed byte elements from 'cnst0' producing a result
    759                  twice the size of input i.e. signed halfword.
    760                  The multiplication result of adjacent odd-even elements
    761                  are added to the 'out0' vector
    762 */
    763 #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {         \
    764   out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0);  \
    765   out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1);  \
    766 }
    767 #define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
    768 
    769 #define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                     \
    770                   cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) {  \
    771   DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);              \
    772   DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);              \
    773 }
    774 #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
    775 
    776 /* Description : Dot product & addition of halfword vector elements
    777    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
    778                  Outputs - out0, out1
    779                  Return Type - as per RTYPE
    780    Details     : Signed halfword elements from 'mult0' are multiplied with
    781                  signed halfword elements from 'cnst0' producing a result
    782                  twice the size of input i.e. signed word.
    783                  The multiplication result of adjacent odd-even elements
    784                  are added to the 'out0' vector
    785 */
    786 #define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {         \
    787   out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0);  \
    788   out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1);  \
    789 }
    790 #define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
    791 
    792 /* Description : Dot product & addition of double word vector elements
    793    Arguments   : Inputs  - mult0, mult1
    794                  Outputs - out0, out1
    795                  Return Type - as per RTYPE
    796    Details     : Each signed word element from 'mult0' is multiplied with itself
    797                  producing an intermediate result twice the size of input
    798                  i.e. signed double word
    799                  The multiplication result of adjacent odd-even elements
    800                  are added to the 'out0' vector
    801 */
    802 #define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) {                       \
    803   out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0);  \
    804   out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1);  \
    805 }
    806 #define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
    807 
    808 /* Description : Minimum values between unsigned elements of
    809                  either vector are copied to the output vector
    810    Arguments   : Inputs  - in0, in1, min_vec
    811                  Outputs - in place operation
    812                  Return Type - as per RTYPE
    813    Details     : Minimum of unsigned halfword element values from 'in0' and
    814                  'min_vec' are written to output vector 'in0'
    815 */
    816 #define MIN_UH2(RTYPE, in0, in1, min_vec) {         \
    817   in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec);  \
    818   in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec);  \
    819 }
    820 #define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
    821 
    822 #define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) {  \
    823   MIN_UH2(RTYPE, in0, in1, min_vec);                   \
    824   MIN_UH2(RTYPE, in2, in3, min_vec);                   \
    825 }
    826 #define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
    827 
    828 /* Description : Clips all signed halfword elements of input vector
    829                  between 0 & 255
    830    Arguments   : Input  - in
    831                  Output - out_m
    832                  Return Type - signed halfword
    833 */
    834 #define CLIP_SH_0_255(in) ({                          \
    835   v8i16 max_m = __msa_ldi_h(255);                     \
    836   v8i16 out_m;                                        \
    837                                                       \
    838   out_m = __msa_maxi_s_h((v8i16)in, 0);               \
    839   out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m);  \
    840   out_m;                                              \
    841 })
    842 #define CLIP_SH2_0_255(in0, in1) {  \
    843   in0 = CLIP_SH_0_255(in0);         \
    844   in1 = CLIP_SH_0_255(in1);         \
    845 }
    846 #define CLIP_SH4_0_255(in0, in1, in2, in3) {  \
    847   CLIP_SH2_0_255(in0, in1);                   \
    848   CLIP_SH2_0_255(in2, in3);                   \
    849 }
    850 
    851 /* Description : Horizontal addition of 4 signed word elements of input vector
    852    Arguments   : Input  - in       (signed word vector)
    853                  Output - sum_m    (i32 sum)
    854                  Return Type - signed word (GP)
    855    Details     : 4 signed word elements of 'in' vector are added together and
    856                  the resulting integer sum is returned
    857 */
    858 #define HADD_SW_S32(in) ({                        \
    859   v2i64 res0_m, res1_m;                           \
    860   int32_t sum_m;                                  \
    861                                                   \
    862   res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in);  \
    863   res1_m = __msa_splati_d(res0_m, 1);             \
    864   res0_m = res0_m + res1_m;                       \
    865   sum_m = __msa_copy_s_w((v4i32)res0_m, 0);       \
    866   sum_m;                                          \
    867 })
    868 
    869 /* Description : Horizontal addition of 8 unsigned halfword elements
    870    Arguments   : Inputs  - in       (unsigned halfword vector)
    871                  Outputs - sum_m    (u32 sum)
    872                  Return Type - unsigned word
    873    Details     : 8 unsigned halfword elements of input vector are added
    874                  together and the resulting integer sum is returned
    875 */
    876 #define HADD_UH_U32(in) ({                           \
    877   v4u32 res_m;                                       \
    878   v2u64 res0_m, res1_m;                              \
    879   uint32_t sum_m;                                    \
    880                                                      \
    881   res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in);      \
    882   res0_m = __msa_hadd_u_d(res_m, res_m);             \
    883   res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1);  \
    884   res0_m = res0_m + res1_m;                          \
    885   sum_m = __msa_copy_u_w((v4i32)res0_m, 0);          \
    886   sum_m;                                             \
    887 })
    888 
    889 /* Description : Horizontal addition of unsigned byte vector elements
    890    Arguments   : Inputs  - in0, in1
    891                  Outputs - out0, out1
    892                  Return Type - as per RTYPE
    893    Details     : Each unsigned odd byte element from 'in0' is added to
    894                  even unsigned byte element from 'in0' (pairwise) and the
    895                  halfword result is written to 'out0'
    896 */
    897 #define HADD_UB2(RTYPE, in0, in1, out0, out1) {          \
    898   out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0);  \
    899   out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1);  \
    900 }
    901 #define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
    902 
    903 #define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) {  \
    904   HADD_UB2(RTYPE, in0, in1, out0, out1);                               \
    905   HADD_UB2(RTYPE, in2, in3, out2, out3);                               \
    906 }
    907 #define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
    908 
    909 /* Description : Horizontal subtraction of unsigned byte vector elements
    910    Arguments   : Inputs  - in0, in1
    911                  Outputs - out0, out1
    912                  Return Type - as per RTYPE
    913    Details     : Each unsigned odd byte element from 'in0' is subtracted from
    914                  even unsigned byte element from 'in0' (pairwise) and the
    915                  halfword result is written to 'out0'
    916 */
    917 #define HSUB_UB2(RTYPE, in0, in1, out0, out1) {          \
    918   out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0);  \
    919   out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1);  \
    920 }
    921 #define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
    922 
    923 /* Description : SAD (Sum of Absolute Difference)
    924    Arguments   : Inputs  - in0, in1, ref0, ref1
    925                  Outputs - sad_m                 (halfword vector)
    926                  Return Type - unsigned halfword
    927    Details     : Absolute difference of all the byte elements from 'in0' with
    928                  'ref0' is calculated and preserved in 'diff0'. Then even-odd
    929                  pairs are added together to generate 8 halfword results.
    930 */
    931 #define SAD_UB2_UH(in0, in1, ref0, ref1) ({                 \
    932   v16u8 diff0_m, diff1_m;                                   \
    933   v8u16 sad_m = { 0 };                                      \
    934                                                             \
    935   diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0);        \
    936   diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1);        \
    937                                                             \
    938   sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m);  \
    939   sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m);  \
    940                                                             \
    941   sad_m;                                                    \
    942 })
    943 
    944 /* Description : Horizontal subtraction of signed halfword vector elements
    945    Arguments   : Inputs  - in0, in1
    946                  Outputs - out0, out1
    947                  Return Type - as per RTYPE
    948    Details     : Each signed odd halfword element from 'in0' is subtracted from
    949                  even signed halfword element from 'in0' (pairwise) and the
    950                  word result is written to 'out0'
    951 */
    952 #define HSUB_UH2(RTYPE, in0, in1, out0, out1) {          \
    953   out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0);  \
    954   out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1);  \
    955 }
    956 #define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
    957 
    958 /* Description : Set element n input vector to GPR value
    959    Arguments   : Inputs - in0, in1, in2, in3
    960                  Output - out
    961                  Return Type - as per RTYPE
    962    Details     : Set element 0 in vector 'out' to value specified in 'in0'
    963 */
    964 #define INSERT_W2(RTYPE, in0, in1, out) {           \
    965   out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0);  \
    966   out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1);  \
    967 }
    968 #define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
    969 
    970 #define INSERT_W4(RTYPE, in0, in1, in2, in3, out) {  \
    971   out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0);   \
    972   out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1);   \
    973   out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2);   \
    974   out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3);   \
    975 }
    976 #define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
    977 #define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
    978 
    979 #define INSERT_D2(RTYPE, in0, in1, out) {           \
    980   out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0);  \
    981   out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1);  \
    982 }
    983 #define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
    984 #define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
    985 
    986 /* Description : Interleave even byte elements from vectors
    987    Arguments   : Inputs  - in0, in1, in2, in3
    988                  Outputs - out0, out1
    989                  Return Type - as per RTYPE
    990    Details     : Even byte elements of 'in0' and 'in1' are interleaved
    991                  and written to 'out0'
    992 */
    993 #define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
    994   out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0);     \
    995   out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2);     \
    996 }
    997 #define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
    998 #define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
    999 
   1000 /* Description : Interleave even halfword elements from vectors
   1001    Arguments   : Inputs  - in0, in1, in2, in3
   1002                  Outputs - out0, out1
   1003                  Return Type - as per RTYPE
   1004    Details     : Even halfword elements of 'in0' and 'in1' are interleaved
   1005                  and written to 'out0'
   1006 */
   1007 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
   1008   out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0);     \
   1009   out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2);     \
   1010 }
   1011 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
   1012 #define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
   1013 #define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
   1014 
   1015 /* Description : Interleave even word elements from vectors
   1016    Arguments   : Inputs  - in0, in1, in2, in3
   1017                  Outputs - out0, out1
   1018                  Return Type - as per RTYPE
   1019    Details     : Even word elements of 'in0' and 'in1' are interleaved
   1020                  and written to 'out0'
   1021 */
   1022 #define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
   1023   out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0);     \
   1024   out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2);     \
   1025 }
   1026 #define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
   1027 
   1028 /* Description : Interleave even double word elements from vectors
   1029    Arguments   : Inputs  - in0, in1, in2, in3
   1030                  Outputs - out0, out1
   1031                  Return Type - as per RTYPE
   1032    Details     : Even double word elements of 'in0' and 'in1' are interleaved
   1033                  and written to 'out0'
   1034 */
   1035 #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
   1036   out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0);     \
   1037   out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2);     \
   1038 }
   1039 #define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
   1040 
   1041 /* Description : Interleave left half of byte elements from vectors
   1042    Arguments   : Inputs  - in0, in1, in2, in3
   1043                  Outputs - out0, out1
   1044                  Return Type - as per RTYPE
   1045    Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
   1046                  and written to 'out0'.
   1047 */
   1048 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
   1049   out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1);     \
   1050   out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3);     \
   1051 }
   1052 #define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
   1053 #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
   1054 #define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
   1055 #define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
   1056 
   1057 #define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
   1058                 out0, out1, out2, out3) {                       \
   1059   ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
   1060   ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
   1061 }
   1062 #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
   1063 #define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
   1064 
   1065 /* Description : Interleave left half of halfword elements from vectors
   1066    Arguments   : Inputs  - in0, in1, in2, in3
   1067                  Outputs - out0, out1
   1068                  Return Type - as per RTYPE
   1069    Details     : Left half of halfword elements of 'in0' and 'in1' are
   1070                  interleaved and written to 'out0'.
   1071 */
   1072 #define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
   1073   out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1);     \
   1074   out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3);     \
   1075 }
   1076 #define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
   1077 
   1078 /* Description : Interleave left half of word elements from vectors
   1079    Arguments   : Inputs  - in0, in1, in2, in3
   1080                  Outputs - out0, out1
   1081                  Return Type - as per RTYPE
   1082    Details     : Left half of word elements of 'in0' and 'in1' are interleaved
   1083                  and written to 'out0'.
   1084 */
   1085 #define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
   1086   out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1);     \
   1087   out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3);     \
   1088 }
   1089 #define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
   1090 #define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
   1091 
   1092 /* Description : Interleave right half of byte elements from vectors
   1093    Arguments   : Inputs  - in0, in1, in2, in3
   1094                  Outputs - out0, out1
   1095                  Return Type - as per RTYPE
   1096    Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
   1097                  and written to out0.
   1098 */
   1099 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
   1100   out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);     \
   1101   out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3);     \
   1102 }
   1103 #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
   1104 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
   1105 #define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
   1106 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
   1107 
   1108 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
   1109                 out0, out1, out2, out3) {                       \
   1110   ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
   1111   ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
   1112 }
   1113 #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
   1114 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
   1115 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
   1116 #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
   1117 
   1118 #define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
   1119                 in8, in9, in10, in11, in12, in13, in14, in15,      \
   1120                 out0, out1, out2, out3, out4, out5, out6, out7) {  \
   1121   ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,           \
   1122           out0, out1, out2, out3);                                 \
   1123   ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15,     \
   1124           out4, out5, out6, out7);                                 \
   1125 }
   1126 #define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
   1127 
   1128 /* Description : Interleave right half of halfword elements from vectors
   1129    Arguments   : Inputs  - in0, in1, in2, in3
   1130                  Outputs - out0, out1
   1131                  Return Type - as per RTYPE
   1132    Details     : Right half of halfword elements of 'in0' and 'in1' are
   1133                  interleaved and written to 'out0'.
   1134 */
   1135 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
   1136   out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);     \
   1137   out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3);     \
   1138 }
   1139 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
   1140 
   1141 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
   1142                 out0, out1, out2, out3) {                       \
   1143   ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);               \
   1144   ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);               \
   1145 }
   1146 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
   1147 
   1148 #define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
   1149   out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1);     \
   1150   out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3);     \
   1151 }
   1152 #define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
   1153 #define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
   1154 
   1155 #define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
   1156                 out0, out1, out2, out3) {                       \
   1157   ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);               \
   1158   ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);               \
   1159 }
   1160 #define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
   1161 
   1162 /* Description : Interleave right half of double word elements from vectors
   1163    Arguments   : Inputs  - in0, in1, in2, in3
   1164                  Outputs - out0, out1
   1165                  Return Type - as per RTYPE
   1166    Details     : Right half of double word elements of 'in0' and 'in1' are
   1167                  interleaved and written to 'out0'.
   1168 */
   1169 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
   1170   out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));  \
   1171   out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3));  \
   1172 }
   1173 #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
   1174 #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
   1175 #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
   1176 
   1177 #define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) {  \
   1178   ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                         \
   1179   out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5));                 \
   1180 }
   1181 #define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
   1182 
   1183 #define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
   1184                 out0, out1, out2, out3) {                       \
   1185   ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);               \
   1186   ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);               \
   1187 }
   1188 #define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
   1189 #define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
   1190 
   /* Description : Interleave both right and left halves of input vectors
      Arguments   : Inputs  - in0, in1
                    Outputs - out0, out1
                    Return Type - as per RTYPE
      Details     : Right half elements of 'in0' and 'in1' are interleaved and
                    written to 'out0'; left half elements of the same pair are
                    interleaved and written to 'out1'.
                    Element width is byte (B2), halfword (H2) or word (W2).
                    'in0' and 'in1' are each expanded twice and are
                    parenthesized before the cast, so an expression argument is
                    cast as a whole; avoid arguments with side effects.
   */
   #define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
     { \
       out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
       out1 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
     }
   #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
   #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
   #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
   #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)

   #define ILVRL_H2(RTYPE, in0, in1, out0, out1) \
     { \
       out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
       out1 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \
     }
   #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
   #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

   #define ILVRL_W2(RTYPE, in0, in1, out0, out1) \
     { \
       out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
       out1 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \
     }
   #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
   #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
   1220 
   /* Description : Saturate the halfword element values to the max
                    unsigned value of (sat_val + 1) bits
                    The element data width remains unchanged
      Arguments   : Inputs  - in0, in1, sat_val
                    Outputs - in place operation
                    Return Type - as per RTYPE
      Details     : Each unsigned halfword element from 'in0' is saturated to the
                    value generated with (sat_val + 1) bit range.
                    The results are written in place, so every 'inN' must be
                    an assignable vector variable.
   */
   #define SAT_UH2(RTYPE, in0, in1, sat_val) \
     { \
       in0 = (RTYPE)__msa_sat_u_h((v8u16)(in0), sat_val); \
       in1 = (RTYPE)__msa_sat_u_h((v8u16)(in1), sat_val); \
     }
   #define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)

   /* Four-vector form: two SAT_UH2 expansions sharing the same 'sat_val' */
   #define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
     { \
       SAT_UH2(RTYPE, in0, in1, sat_val); \
       SAT_UH2(RTYPE, in2, in3, sat_val); \
     }
   #define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
   1242 
   /* Description : Saturate the halfword element values to the max
                    signed value of (sat_val + 1) bits
                    The element data width remains unchanged
      Arguments   : Inputs  - in0, in1, sat_val
                    Outputs - in place operation
                    Return Type - as per RTYPE
      Details     : Each signed halfword element from 'in0' is saturated
                    (__msa_sat_s_h) to the value generated with (sat_val + 1)
                    bit range.
                    The results are written in place, so every 'inN' must be
                    an assignable vector variable.
   */
   #define SAT_SH2(RTYPE, in0, in1, sat_val) \
     { \
       in0 = (RTYPE)__msa_sat_s_h((v8i16)(in0), sat_val); \
       in1 = (RTYPE)__msa_sat_s_h((v8i16)(in1), sat_val); \
     }
   #define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

   /* Four-vector form: two SAT_SH2 expansions sharing the same 'sat_val' */
   #define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
     { \
       SAT_SH2(RTYPE, in0, in1, sat_val); \
       SAT_SH2(RTYPE, in2, in3, sat_val); \
     }
   #define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
   1264 
   /* Description : Indexed halfword element values are replicated to all
                    elements in output vector
      Arguments   : Inputs  - in, idx0, idx1
                    Outputs - out0, out1
                    Return Type - as per RTYPE
      Details     : 'idx0' element value from 'in' vector is replicated to all
                    elements in 'out0' vector; 'idx1' element likewise fills
                    'out1'.
                    Valid index range for halfword operation is 0-7
                    'in' is expanded once per output and is parenthesized
                    before the cast.
   */
   #define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
     { \
       out0 = (RTYPE)__msa_splati_h((v8i16)(in), idx0); \
       out1 = (RTYPE)__msa_splati_h((v8i16)(in), idx1); \
     }
   #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

   /* Four-index form: two SPLATI_H2 expansions over the same 'in' vector */
   #define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \
                     out0, out1, out2, out3) \
     { \
       SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
       SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
     }
   #define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
   #define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
   1287 
   /* Description : Pack even byte elements of vector pairs
      Arguments   : Inputs  - in0, in1, in2, in3
                    Outputs - out0, out1
                    Return Type - as per RTYPE
      Details     : Even byte elements of 'in0' are copied to the left half of
                    'out0' & even byte elements of 'in1' are copied to the right
                    half of 'out0'. The pair ('in2', 'in3') produces 'out1' the
                    same way.
                    Inputs are parenthesized before the cast so expression
                    arguments are cast as a whole.
   */
   #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
     { \
       out0 = (RTYPE)__msa_pckev_b((v16i8)(in0), (v16i8)(in1)); \
       out1 = (RTYPE)__msa_pckev_b((v16i8)(in2), (v16i8)(in3)); \
     }
   #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
   #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
   #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)

   /* Four-pair form: two PCKEV_B2 expansions over consecutive input pairs */
   #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                    out0, out1, out2, out3) \
     { \
       PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
       PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
     }
   #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
   #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
   #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
   1312 
   /* Description : Pack even halfword elements of vector pairs
      Arguments   : Inputs  - in0, in1, in2, in3
                    Outputs - out0, out1
                    Return Type - as per RTYPE
      Details     : Even halfword elements of 'in0' are copied to the left half of
                    'out0' & even halfword elements of 'in1' are copied to the
                    right half of 'out0'. The pair ('in2', 'in3') produces
                    'out1' the same way.
                    Inputs are parenthesized before the cast so expression
                    arguments are cast as a whole.
   */
   #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
     { \
       out0 = (RTYPE)__msa_pckev_h((v8i16)(in0), (v8i16)(in1)); \
       out1 = (RTYPE)__msa_pckev_h((v8i16)(in2), (v8i16)(in3)); \
     }
   #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
   #define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

   /* Four-pair form: two PCKEV_H2 expansions over consecutive input pairs */
   #define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                    out0, out1, out2, out3) \
     { \
       PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
       PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
     }
   #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
   1334 
   /* Description : Pack even double word elements of vector pairs
      Arguments   : Inputs  - in0, in1, in2, in3
                    Outputs - out0, out1
                    Return Type - as per RTYPE
      Details     : Even double elements of 'in0' are copied to the left half of
                    'out0' & even double elements of 'in1' are copied to the right
                    half of 'out0'. The pair ('in2', 'in3') produces 'out1' the
                    same way.
                    Inputs are parenthesized before the cast so expression
                    arguments are cast as a whole.
   */
   #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
     { \
       out0 = (RTYPE)__msa_pckev_d((v2i64)(in0), (v2i64)(in1)); \
       out1 = (RTYPE)__msa_pckev_d((v2i64)(in2), (v2i64)(in3)); \
     }
   #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
   #define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

   /* Four-pair form: two PCKEV_D2 expansions over consecutive input pairs */
   #define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                    out0, out1, out2, out3) \
     { \
       PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
       PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
     }
   #define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
   1356 
   /* Description : Each byte element is logically xor'ed with immediate 128
      Arguments   : Inputs  - in0, in1
                    Outputs - in place operation
                    Return Type - as per RTYPE
      Details     : Each unsigned byte element from input vector 'in0' is
                    logically xor'ed with 128 and the result is stored in-place.
                    The B3/B4/B7 forms apply the same operation to 3, 4 and 7
                    vectors. Every 'inN' must be an assignable vector variable.
   */
   #define XORI_B2_128(RTYPE, in0, in1) \
     { \
       in0 = (RTYPE)__msa_xori_b((v16u8)(in0), 128); \
       in1 = (RTYPE)__msa_xori_b((v16u8)(in1), 128); \
     }
   #define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
   #define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)

   #define XORI_B3_128(RTYPE, in0, in1, in2) \
     { \
       XORI_B2_128(RTYPE, in0, in1); \
       in2 = (RTYPE)__msa_xori_b((v16u8)(in2), 128); \
     }
   #define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

   #define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
     { \
       XORI_B2_128(RTYPE, in0, in1); \
       XORI_B2_128(RTYPE, in2, in3); \
     }
   #define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
   #define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)

   #define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
     { \
       XORI_B4_128(RTYPE, in0, in1, in2, in3); \
       XORI_B3_128(RTYPE, in4, in5, in6); \
     }
   #define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
   1389 
   /* Description : Average of signed halfword elements -> (a + b) / 2
      Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                    Outputs - out0, out1, out2, out3
                    Return Type - as per RTYPE
      Details     : Each signed halfword element from 'in0' is added to each
                    signed halfword element of 'in1' with full precision resulting
                    in one extra bit in the result. The result is then divided by
                    2 and written to 'out0'. The pairs ('in2', 'in3'),
                    ('in4', 'in5') and ('in6', 'in7') produce 'out1', 'out2'
                    and 'out3' the same way.
                    Inputs are parenthesized before the cast so expression
                    arguments are cast as a whole.
   */
   #define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                   out0, out1, out2, out3) \
     { \
       out0 = (RTYPE)__msa_ave_s_h((v8i16)(in0), (v8i16)(in1)); \
       out1 = (RTYPE)__msa_ave_s_h((v8i16)(in2), (v8i16)(in3)); \
       out2 = (RTYPE)__msa_ave_s_h((v8i16)(in4), (v8i16)(in5)); \
       out3 = (RTYPE)__msa_ave_s_h((v8i16)(in6), (v8i16)(in7)); \
     }
   #define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
   1407 
   /* Description : Addition of signed halfword elements and signed saturation
      Arguments   : Inputs  - in0, in1, in2, in3
                    Outputs - out0, out1
                    Return Type - as per RTYPE
      Details     : Signed halfword elements from 'in0' are added to signed
                    halfword elements of 'in1'. The result is then signed saturated
                    between halfword data type range and written to 'out0'.
                    The pair ('in2', 'in3') produces 'out1' the same way.
                    Inputs are parenthesized before the cast so expression
                    arguments are cast as a whole.
   */
   #define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \
     { \
       out0 = (RTYPE)__msa_adds_s_h((v8i16)(in0), (v8i16)(in1)); \
       out1 = (RTYPE)__msa_adds_s_h((v8i16)(in2), (v8i16)(in3)); \
     }
   #define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

   /* Four-pair form: two ADDS_SH2 expansions over consecutive input pairs */
   #define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                    out0, out1, out2, out3) \
     { \
       ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \
       ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \
     }
   #define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
   1428 
   /* Description : Shift left all elements of vector (generic for all data types)
      Arguments   : Inputs  - in0, in1, in2, in3, shift
                    Outputs - in place operation
                    Return Type - as per input vector RTYPE
      Details     : Each element of vector 'in0' is left shifted by 'shift' and
                    the result is written in-place; 'in1', 'in2' and 'in3' are
                    handled the same way.
                    'shift' is expanded four times and is parenthesized, so an
                    expression such as 'a & b' is used as a whole shift count.
   */
   #define SLLI_4V(in0, in1, in2, in3, shift) \
     { \
       in0 = (in0) << (shift); \
       in1 = (in1) << (shift); \
       in2 = (in2) << (shift); \
       in3 = (in3) << (shift); \
     }
   1442 
   /* Description : Arithmetic shift right all elements of vector
                    (generic for all data types)
      Arguments   : Inputs  - in0, in1, in2, in3, shift
                    Outputs - in place operation
                    Return Type - as per input vector RTYPE
      Details     : Each element of vector 'in0' is right shifted by 'shift' and
                    the result is written in-place. 'shift' is a GP variable.
                    'in1', 'in2' and 'in3' are handled the same way.
                    'shift' is expanded four times and is parenthesized, so an
                    expression such as 'a & b' is used as a whole shift count.
   */
   #define SRA_4V(in0, in1, in2, in3, shift) \
     { \
       in0 = (in0) >> (shift); \
       in1 = (in1) >> (shift); \
       in2 = (in2) >> (shift); \
       in3 = (in3) >> (shift); \
     }
   1457 
   /* Description : Shift right arithmetic rounded words
      Arguments   : Inputs  - in0, in1, shift
                    Outputs - in place operation
                    Return Type - as per RTYPE
      Details     : Each element of vector 'in0' is shifted right arithmetically by
                    the number of bits in the corresponding element in the vector
                    'shift'. The last discarded bit is added to shifted value for
                    rounding and the result is written in-place.
                    'shift' is a vector.
                    'shift' is expanded once per input vector and, like the
                    inputs, is parenthesized before the cast.
   */
   #define SRAR_W2(RTYPE, in0, in1, shift) \
     { \
       in0 = (RTYPE)__msa_srar_w((v4i32)(in0), (v4i32)(shift)); \
       in1 = (RTYPE)__msa_srar_w((v4i32)(in1), (v4i32)(shift)); \
     }

   /* Four-vector form: two SRAR_W2 expansions sharing the same 'shift' */
   #define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
     { \
       SRAR_W2(RTYPE, in0, in1, shift); \
       SRAR_W2(RTYPE, in2, in3, shift); \
     }
   #define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
   1478 
   /* Description : Shift right arithmetic rounded (immediate)
      Arguments   : Inputs  - in0, in1, shift
                    Outputs - in place operation
                    Return Type - as per RTYPE
      Details     : Each element of vector 'in0' is shifted right arithmetically by
                    the value in 'shift'. The last discarded bit is added to the
                    shifted value for rounding and the result is written in-place.
                    'shift' is an immediate value.
                    The H variants operate on halfword elements, the W variants
                    on word elements; the 4-vector forms expand the 2-vector
                    form twice with the same 'shift'.
   */
   #define SRARI_H2(RTYPE, in0, in1, shift) \
     { \
       in0 = (RTYPE)__msa_srari_h((v8i16)(in0), shift); \
       in1 = (RTYPE)__msa_srari_h((v8i16)(in1), shift); \
     }
   #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
   #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

   #define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
     { \
       SRARI_H2(RTYPE, in0, in1, shift); \
       SRARI_H2(RTYPE, in2, in3, shift); \
     }
   #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
   #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)

   #define SRARI_W2(RTYPE, in0, in1, shift) \
     { \
       in0 = (RTYPE)__msa_srari_w((v4i32)(in0), shift); \
       in1 = (RTYPE)__msa_srari_w((v4i32)(in1), shift); \
     }
   #define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

   #define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
     { \
       SRARI_W2(RTYPE, in0, in1, shift); \
       SRARI_W2(RTYPE, in2, in3, shift); \
     }
   #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
   1513 
   /* Description : Logical shift right all elements of vector (immediate)
      Arguments   : Inputs  - in0, in1, in2, in3, shift
                    Outputs - out0, out1, out2, out3
                    Return Type - as per RTYPE
      Details     : Each element of vector 'in0' is right shifted by 'shift' and
                    the result is written to 'out0'; 'in1'..'in3' produce
                    'out1'..'out3' the same way. 'shift' is an immediate value.
                    Note that 'shift' is the LAST macro argument, after the
                    outputs.
   */
   #define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \
     { \
       out0 = (RTYPE)__msa_srli_h((v8i16)(in0), shift); \
       out1 = (RTYPE)__msa_srli_h((v8i16)(in1), shift); \
       out2 = (RTYPE)__msa_srli_h((v8i16)(in2), shift); \
       out3 = (RTYPE)__msa_srli_h((v8i16)(in3), shift); \
     }
   #define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
   1528 
   /* Description : Multiplication of pairs of vectors
      Arguments   : Inputs  - in0, in1, in2, in3
                    Outputs - out0, out1
      Details     : Each element from 'in0' is multiplied with elements from 'in1'
                    and the result is written to 'out0'; 'in2' * 'in3' is
                    written to 'out1'.
                    Operands are parenthesized, so an expression argument such
                    as 'a + b' is multiplied as a whole.
   */
   #define MUL2(in0, in1, in2, in3, out0, out1) \
     { \
       out0 = (in0) * (in1); \
       out1 = (in2) * (in3); \
     }
   /* Four-pair form: two MUL2 expansions over consecutive input pairs */
   #define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
     { \
       MUL2(in0, in1, in2, in3, out0, out1); \
       MUL2(in4, in5, in6, in7, out2, out3); \
     }
   1544 
   /* Description : Addition of 2 pairs of vectors
      Arguments   : Inputs  - in0, in1, in2, in3
                    Outputs - out0, out1
      Details     : Each element in 'in0' is added to 'in1' and result is written
                    to 'out0'; 'in2' + 'in3' is written to 'out1'.
                    Operands are parenthesized, so an expression argument with
                    a lower-precedence operator (e.g. 'a & b') is added as a
                    whole.
   */
   #define ADD2(in0, in1, in2, in3, out0, out1) \
     { \
       out0 = (in0) + (in1); \
       out1 = (in2) + (in3); \
     }
   /* Four-pair form: two ADD2 expansions over consecutive input pairs */
   #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
     { \
       ADD2(in0, in1, in2, in3, out0, out1); \
       ADD2(in4, in5, in6, in7, out2, out3); \
     }
   1560 
   /* Description : Subtraction of 2 pairs of vectors
      Arguments   : Inputs  - in0, in1, in2, in3
                    Outputs - out0, out1
      Details     : Each element in 'in1' is subtracted from 'in0' and result is
                    written to 'out0'; 'in2' - 'in3' is written to 'out1'.
                    Operands are parenthesized, so an expression argument such
                    as 'b - c' is subtracted as a whole.
   */
   #define SUB2(in0, in1, in2, in3, out0, out1) \
     { \
       out0 = (in0) - (in1); \
       out1 = (in2) - (in3); \
     }
   /* Four-pair form: two SUB2 expansions, mirroring ADD4 and MUL4 */
   #define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
     { \
       SUB2(in0, in1, in2, in3, out0, out1); \
       SUB2(in4, in5, in6, in7, out2, out3); \
     }
   1578 
   /* Description : Sign extend halfword elements from right half of the vector
      Arguments   : Input  - in    (halfword vector)
                    Output - out   (sign extended word vector)
                    Return Type - signed word
      Details     : Sign bit of halfword elements from input vector 'in' is
                    extracted (compare less-than-zero) and interleaved right
                    with the same vector 'in' to generate 4 word elements
                    keeping sign intact.
                    'in' is expanded twice and is parenthesized before the
                    cast; avoid arguments with side effects.
   */
   #define UNPCK_R_SH_SW(in, out) \
     { \
       v8i16 sign_m; \
       \
       sign_m = __msa_clti_s_h((v8i16)(in), 0); \
       out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)(in)); \
     }
   1593 
   /* Description : Zero extend unsigned byte elements to halfword elements
      Arguments   : Input   - in          (unsigned byte vector)
                    Outputs - out0, out1  (unsigned  halfword vectors)
                    Return Type - signed halfword
      Details     : Zero extended right half of vector is returned in 'out0'
                    Zero extended left half of vector is returned in 'out1'
                    'in' is interleaved with an all-zero byte vector via
                    ILVRL_B2_SH; it is forwarded parenthesized so an expression
                    argument is cast as a whole inside that macro.
   */
   #define UNPCK_UB_SH(in, out0, out1) \
     { \
       v16i8 zero_m = { 0 }; \
       \
       ILVRL_B2_SH(zero_m, (in), out0, out1); \
     }
   1606 
   /* Description : Sign extend halfword elements from input vector and return
                    the result in pair of vectors
      Arguments   : Input   - in            (halfword vector)
                    Outputs - out0, out1   (sign extended word vectors)
                    Return Type - signed word
      Details     : Sign bit of halfword elements from input vector 'in' is
                    extracted and interleaved right with same vector 'in' to
                    generate 4 signed word elements in 'out0'
                    Then interleaved left with same vector 'in' to
                    generate 4 signed word elements in 'out1'
                    'in' is parenthesized both in the local cast and when
                    forwarded to ILVRL_H2_SW, so an expression argument is
                    cast as a whole.
   */
   #define UNPCK_SH_SW(in, out0, out1) \
     { \
       v8i16 tmp_m; \
       \
       tmp_m = __msa_clti_s_h((v8i16)(in), 0); \
       ILVRL_H2_SW(tmp_m, (in), out0, out1); \
     }
   1624 
   /* Description : Butterfly of 4 input vectors
      Arguments   : Inputs  - in0, in1, in2, in3
                    Outputs - out0, out1, out2, out3
      Details     : Butterfly operation
                    out0 = in0 + in3, out1 = in1 + in2,
                    out2 = in1 - in2, out3 = in0 - in3.
                    Operands are parenthesized so expression arguments are
                    combined as a whole. Outputs are assigned in the order
                    out0..out3 while inputs are still being read, so an output
                    should not name a variable that a later line reads as input.
   */
   #define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
     { \
       out0 = (in0) + (in3); \
       out1 = (in1) + (in2); \
       \
       out2 = (in1) - (in2); \
       out3 = (in0) - (in3); \
     }
   1637 
   /* Description : Butterfly of 8 input vectors
      Arguments   : Inputs  - in0 ...  in7
                    Outputs - out0 .. out7
      Details     : Butterfly operation
                    out[k]     = in[k] + in[7 - k]   for k = 0..3
                    out[7 - k] = in[k] - in[7 - k]   for k = 0..3
                    Operands are parenthesized so expression arguments are
                    combined as a whole. Outputs are assigned in the order
                    out0..out7 while inputs are still being read, so an output
                    should not name a variable that a later line reads as input.
   */
   #define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \
                       out0, out1, out2, out3, out4, out5, out6, out7) \
     { \
       out0 = (in0) + (in7); \
       out1 = (in1) + (in6); \
       out2 = (in2) + (in5); \
       out3 = (in3) + (in4); \
       \
       out4 = (in3) - (in4); \
       out5 = (in2) - (in5); \
       out6 = (in1) - (in6); \
       out7 = (in0) - (in7); \
     }
   1655 
   /* Description : Butterfly of 16 input vectors
      Arguments   : Inputs  - in0 ...  in15
                    Outputs - out0 .. out15
      Details     : Butterfly operation
                    out[k]      = in[k] + in[15 - k]   for k = 0..7
                    out[15 - k] = in[k] - in[15 - k]   for k = 0..7
                    Operands are parenthesized so expression arguments are
                    combined as a whole. Outputs are assigned in the order
                    out0..out15 while inputs are still being read, so an output
                    should not name a variable that a later line reads as input.
   */
   #define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \
                        in8, in9, in10, in11, in12, in13, in14, in15, \
                        out0, out1, out2, out3, out4, out5, out6, out7, \
                        out8, out9, out10, out11, out12, out13, out14, out15) \
     { \
       out0 = (in0) + (in15); \
       out1 = (in1) + (in14); \
       out2 = (in2) + (in13); \
       out3 = (in3) + (in12); \
       out4 = (in4) + (in11); \
       out5 = (in5) + (in10); \
       out6 = (in6) + (in9); \
       out7 = (in7) + (in8); \
       \
       out8 = (in7) - (in8); \
       out9 = (in6) - (in9); \
       out10 = (in5) - (in10); \
       out11 = (in4) - (in11); \
       out12 = (in3) - (in12); \
       out13 = (in2) - (in13); \
       out14 = (in1) - (in14); \
       out15 = (in0) - (in15); \
     }
   1683 
   /* Description : Transpose input 8x8 byte block
      Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                    Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                    Return Type - as per RTYPE
      Details     : Right halves of the input rows are byte-interleaved
                    (ILVR_B4_SB), then interleaved again at byte and word
                    granularity to produce the even outputs out0/out2/out4/out6.
                    The odd outputs out1/out3/out5/out7 are then derived from
                    those even outputs by SLDI_B2_0 with a slide value of 8,
                    so the even outputs are read back after being written.
   */
   #define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                           out0, out1, out2, out3, out4, out5, out6, out7) \
     { \
       v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m, tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
       \
       ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \
                  tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
       ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
       ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
       ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
       ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
       SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
       SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
     }
   #define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
   1704 
   /* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
      Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                              in8, in9, in10, in11, in12, in13, in14, in15
                    Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                    Return Type - unsigned byte
      Details     : The outputs double as scratch storage: out7..out0 receive
                    the first double word interleave of the inputs, and out5 /
                    out7 are rewritten as intermediates before the final
                    results are stored. Outputs are therefore written before
                    all inputs have been read and must be variables distinct
                    from the inputs. Statement order matters.
   */
   #define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
                               in8, in9, in10, in11, in12, in13, in14, in15, \
                               out0, out1, out2, out3, out4, out5, out6, out7) \
     { \
       v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
       v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
       \
       ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
       ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
       ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
       ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
       \
       tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \
       tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \
       tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \
       tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \
       out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \
       tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \
       out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \
       tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \
       \
       ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
       out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
       out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
       \
       tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
       tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \
       out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
       out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
       \
       ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
       out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
       out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
       \
       tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \
       tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \
       out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
       out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
     }
   1751 
   /* Description : Transpose 4x4 block with half word elements in vectors
      Arguments   : Inputs  - in0, in1, in2, in3
                    Outputs - out0, out1, out2, out3
                    Return Type - signed halfword
      Details     : Right halves of (in1, in0) and (in3, in2) are halfword
                    interleaved, then word interleaved right/left into 'out0'
                    and 'out2'. 'out1' and 'out3' are built afterwards with
                    __msa_ilvl_d from the already written 'out0' and 'out2'.
   */
   #define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
     { \
       v8i16 s0_m; \
       v8i16 s1_m; \
       \
       ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \
       ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
       out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
       out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \
     }
   1765 
   1766 /* Description : Transpose 4x8 block with half word elements in vectors
   1767    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
   1768                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
   1769                  Return Type - signed halfword
   1770 */
   1771 #define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,            \
   1772                            out0, out1, out2, out3, out4, out5, out6, out7) {  \
   1773   v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                       \
   1774   v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                       \
   1775   v8i16 zero_m = { 0 };                                                       \
   1776                                                                               \
   1777   ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,                          \
   1778              tmp0_n, tmp1_n, tmp2_n, tmp3_n);                                 \
   1779   ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m);                                \
   1780   ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m);                                \
   1781                                                                               \
   1782   out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m);                   \
   1783   out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m);                   \
   1784   out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m);                   \
   1785   out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m);                   \
   1786                                                                               \
   1787   out4 = zero_m;                                                              \
   1788   out5 = zero_m;                                                              \
   1789   out6 = zero_m;                                                              \
   1790   out7 = zero_m;                                                              \
   1791 }
   1792 
   1793 /* Description : Transpose 8x4 block with half word elements in vectors
   1794    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
   1795                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
   1796                  Return Type - signed halfword
   1797 */
   1798 #define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) {  \
   1799   v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
   1800                                                                           \
   1801   ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m);                         \
   1802   ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m);                         \
   1803   ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2);                 \
   1804   ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3);                 \
   1805 }
   1806 
   1807 /* Description : Transpose 8x8 block with half word elements in vectors
   1808    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
   1809                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
   1810                  Return Type - as per RTYPE
   1811 */
   1812 #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
   1813                        out0, out1, out2, out3, out4, out5, out6, out7) {  \
   1814   v8i16 s0_m, s1_m;                                                       \
   1815   v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
   1816   v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                   \
   1817                                                                           \
   1818   ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
   1819   ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                                \
   1820   ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
   1821   ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                                \
   1822   ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
   1823   ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                                \
   1824   ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
   1825   ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                                \
   1826   PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,         \
   1827            tmp3_m, tmp7_m, out0, out2, out4, out6);                       \
   1828   out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m);              \
   1829   out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m);              \
   1830   out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m);              \
   1831   out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m);              \
   1832 }
   1833 #define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
   1834 
   1835 /* Description : Transpose 4x4 block with word elements in vectors
   1836    Arguments   : Inputs  - in0, in1, in2, in3
   1837                  Outputs - out0, out1, out2, out3
   1838                  Return Type - signed word
   1839 */
   1840 #define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) {  \
   1841   v4i32 s0_m, s1_m, s2_m, s3_m;                                           \
   1842                                                                           \
   1843   ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                      \
   1844   ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                      \
   1845                                                                           \
   1846   out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m);                   \
   1847   out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);                   \
   1848   out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);                   \
   1849   out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);                   \
   1850 }
   1851 
   1852 /* Description : Add block 4x4
   1853    Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   1854    Details     : Least significant 4 bytes from each input vector are added to
   1855                  the destination bytes, clipped between 0-255 and stored.
   1856 */
   1857 #define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) {     \
   1858   uint32_t src0_m, src1_m, src2_m, src3_m;                      \
   1859   v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
   1860   v16i8 dst0_m = { 0 };                                         \
   1861   v16i8 dst1_m = { 0 };                                         \
   1862   v16i8 zero_m = { 0 };                                         \
   1863                                                                 \
   1864   ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m)                \
   1865   LW4(pdst, stride,  src0_m, src1_m, src2_m, src3_m);           \
   1866   INSERT_W2_SB(src0_m, src1_m, dst0_m);                         \
   1867   INSERT_W2_SB(src2_m, src3_m, dst1_m);                         \
   1868   ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);   \
   1869   ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
   1870   CLIP_SH2_0_255(res0_m, res1_m);                               \
   1871   PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
   1872   ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride);           \
   1873 }
   1874 
   1875 /* Description : Pack even elements of input vectors & xor with 128
   1876    Arguments   : Inputs - in0, in1
   1877                  Output - out_m
   1878                  Return Type - unsigned byte
   1879    Details     : Signed byte even elements from 'in0' and 'in1' are packed
   1880                  together in one vector and the resulting vector is xor'ed with
   1881                  128 to shift the range from signed to unsigned byte
   1882 */
   1883 #define PCKEV_XORI128_UB(in0, in1) ({                    \
   1884   v16u8 out_m;                                           \
   1885                                                          \
   1886   out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0);  \
   1887   out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128);        \
   1888   out_m;                                                 \
   1889 })
   1890 
   1891 /* Description : Converts inputs to unsigned bytes, interleave, average & store
   1892                  as 8x4 unsigned byte block
   1893    Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
   1894                           pdst, stride
   1895 */
   1896 #define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,                      \
   1897                                 dst0, dst1, dst2, dst3, pdst, stride) {  \
   1898   v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
   1899   uint8_t *pdst_m = (uint8_t *)(pdst);                                   \
   1900                                                                          \
   1901   tmp0_m = PCKEV_XORI128_UB(in0, in1);                                   \
   1902   tmp1_m = PCKEV_XORI128_UB(in2, in3);                                   \
   1903   ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                    \
   1904   AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);           \
   1905   ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                              \
   1906 }
   1907 
   1908 /* Description : Pack even byte elements and store byte vector in destination
   1909                  memory
   1910    Arguments   : Inputs - in0, in1, pdst
   1911 */
   1912 #define PCKEV_ST_SB(in0, in1, pdst) {             \
   1913   v16i8 tmp_m;                                    \
   1914                                                   \
   1915   tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0);  \
   1916   ST_SB(tmp_m, (pdst));                           \
   1917 }
   1918 
   1919 /* Description : Horizontal 2 tap filter kernel code
   1920    Arguments   : Inputs - in0, in1, mask, coeff, shift
   1921 */
   1922 #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({    \
   1923   v16i8 tmp0_m;                                                \
   1924   v8u16 tmp1_m;                                                \
   1925                                                                \
   1926   tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0);  \
   1927   tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff);        \
   1928   tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift);         \
   1929                                                                \
   1930   tmp1_m;                                                      \
   1931 })
   1932 #endif  /* VPX_DSP_MIPS_MACROS_MSA_H_ */
   1933