/* vpx_dsp/ppc/inv_txfm_vsx.c -- PowerPC VSX inverse transform implementations. */
      1 /*
      2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <math.h>
     12 #include <stdlib.h>
     13 #include <string.h>
     14 
     15 #include "vpx_dsp/ppc/bitdepth_conversion_vsx.h"
     16 #include "vpx_dsp/ppc/types_vsx.h"
     17 
     18 #include "./vpx_dsp_rtcd.h"
     19 #include "vpx_dsp/inv_txfm.h"
     20 
     21 static int16x8_t cospi1_v = { 16364, 16364, 16364, 16364,
     22                               16364, 16364, 16364, 16364 };
     23 static int16x8_t cospi2_v = { 16305, 16305, 16305, 16305,
     24                               16305, 16305, 16305, 16305 };
     25 static int16x8_t cospi3_v = { 16207, 16207, 16207, 16207,
     26                               16207, 16207, 16207, 16207 };
     27 static int16x8_t cospi4_v = { 16069, 16069, 16069, 16069,
     28                               16069, 16069, 16069, 16069 };
     29 static int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069,
     30                                -16069, -16069, -16069, -16069 };
     31 static int16x8_t cospi5_v = { 15893, 15893, 15893, 15893,
     32                               15893, 15893, 15893, 15893 };
     33 static int16x8_t cospi6_v = { 15679, 15679, 15679, 15679,
     34                               15679, 15679, 15679, 15679 };
     35 static int16x8_t cospi7_v = { 15426, 15426, 15426, 15426,
     36                               15426, 15426, 15426, 15426 };
     37 static int16x8_t cospi8_v = { 15137, 15137, 15137, 15137,
     38                               15137, 15137, 15137, 15137 };
     39 static int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137,
     40                                -15137, -15137, -15137, -15137 };
     41 static int16x8_t cospi9_v = { 14811, 14811, 14811, 14811,
     42                               14811, 14811, 14811, 14811 };
     43 static int16x8_t cospi10_v = { 14449, 14449, 14449, 14449,
     44                                14449, 14449, 14449, 14449 };
     45 static int16x8_t cospi11_v = { 14053, 14053, 14053, 14053,
     46                                14053, 14053, 14053, 14053 };
     47 static int16x8_t cospi12_v = { 13623, 13623, 13623, 13623,
     48                                13623, 13623, 13623, 13623 };
     49 static int16x8_t cospi13_v = { 13160, 13160, 13160, 13160,
     50                                13160, 13160, 13160, 13160 };
     51 static int16x8_t cospi14_v = { 12665, 12665, 12665, 12665,
     52                                12665, 12665, 12665, 12665 };
     53 static int16x8_t cospi15_v = { 12140, 12140, 12140, 12140,
     54                                12140, 12140, 12140, 12140 };
     55 static int16x8_t cospi16_v = { 11585, 11585, 11585, 11585,
     56                                11585, 11585, 11585, 11585 };
     57 static int16x8_t cospi17_v = { 11003, 11003, 11003, 11003,
     58                                11003, 11003, 11003, 11003 };
     59 static int16x8_t cospi18_v = { 10394, 10394, 10394, 10394,
     60                                10394, 10394, 10394, 10394 };
     61 static int16x8_t cospi19_v = { 9760, 9760, 9760, 9760, 9760, 9760, 9760, 9760 };
     62 static int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, 9102, 9102, 9102, 9102 };
     63 static int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102,
     64                                 -9102, -9102, -9102, -9102 };
     65 static int16x8_t cospi21_v = { 8423, 8423, 8423, 8423, 8423, 8423, 8423, 8423 };
     66 static int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, 7723, 7723, 7723, 7723 };
     67 static int16x8_t cospi23_v = { 7005, 7005, 7005, 7005, 7005, 7005, 7005, 7005 };
     68 static int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, 6270, 6270, 6270, 6270 };
     69 static int16x8_t cospi24_mv = { -6270, -6270, -6270, -6270,
     70                                 -6270, -6270, -6270, -6270 };
     71 static int16x8_t cospi25_v = { 5520, 5520, 5520, 5520, 5520, 5520, 5520, 5520 };
     72 static int16x8_t cospi26_v = { 4756, 4756, 4756, 4756, 4756, 4756, 4756, 4756 };
     73 static int16x8_t cospi27_v = { 3981, 3981, 3981, 3981, 3981, 3981, 3981, 3981 };
     74 static int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, 3196, 3196, 3196, 3196 };
     75 static int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, 2404, 2404, 2404, 2404 };
     76 static int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, 1606, 1606, 1606, 1606 };
     77 static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 };
     78 
/* Declare the rounding constants consumed by DCT_CONST_ROUND_SHIFT:
 * shift = 1 << 13 (the rounding bias) and shift14 = 14 (the shift
 * amount, i.e. DCT_CONST_BITS).  Must appear in any scope that uses the
 * IDCT/STEP macros below. */
#define ROUND_SHIFT_INIT                                               \
  const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13)); \
  const uint32x4_t shift14 = vec_splat_u32(14);

/* vec = (vec + (1 << 13)) >> 14 per 32-bit lane -- the vector form of
 * dct_const_round_shift().  Requires ROUND_SHIFT_INIT in scope. */
#define DCT_CONST_ROUND_SHIFT(vec) vec = vec_sra(vec_add(vec, shift), shift14);

/* Constants for PIXEL_ADD4: the final (x + 8) >> 4 rounding of the
 * two-pass 4x4 inverse transform. */
#define PIXEL_ADD_INIT               \
  int16x8_t add8 = vec_splat_s16(8); \
  uint16x8_t shift4 = vec_splat_u16(4);

/* out = (in + 8) >> 4 per 16-bit lane (round_power_of_two by 4). */
#define PIXEL_ADD4(out, in) out = vec_sra(vec_add(in, add8), shift4);
     90 
/* One pass of the 4-point IDCT over two vectors that each pack two rows
 * of four 16-bit coefficients (see vec_mergeh/vec_mergel transposes at
 * the call sites).  Follows idct4_c() in vpx_dsp/inv_txfm.c: an even
 * butterfly scaled by cospi16 and an odd cospi24/cospi8 butterfly.
 * Uses t0, t1, tmp16_0, temp1..temp4, step0, step1 and mask0 from the
 * enclosing scope; requires ROUND_SHIFT_INIT. */
#define IDCT4(in0, in1, out0, out1)                                           \
  t0 = vec_add(in0, in1);                                                     \
  t1 = vec_sub(in0, in1);                                                     \
  tmp16_0 = vec_mergeh(t0, t1);                                               \
  /* even part: (in0 +/- in1) * cospi16, rounded */                           \
  temp1 = vec_sra(vec_add(vec_mule(tmp16_0, cospi16_v), shift), shift14);     \
  temp2 = vec_sra(vec_add(vec_mulo(tmp16_0, cospi16_v), shift), shift14);     \
                                                                              \
  /* odd part: cospi24/cospi8 butterfly */                                    \
  tmp16_0 = vec_mergel(in0, in1);                                             \
  temp3 = vec_sub(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \
  DCT_CONST_ROUND_SHIFT(temp3);                                               \
  temp4 = vec_add(vec_mule(tmp16_0, cospi8_v), vec_mulo(tmp16_0, cospi24_v)); \
  DCT_CONST_ROUND_SHIFT(temp4);                                               \
                                                                              \
  step0 = vec_packs(temp1, temp2);                                            \
  step1 = vec_packs(temp4, temp3);                                            \
  out0 = vec_add(step0, step1);                                               \
  out1 = vec_sub(step0, step1);                                               \
  /* swap the 8-byte halves so out1's rows come out in natural order */       \
  out1 = vec_perm(out1, out1, mask0);
    109 
// 4x4 inverse DCT (all 16 coefficients present) plus reconstruction:
// dest[r][c] = clip_to_u8(dest[r][c] + rounded inverse transform).
// C reference: vpx_idct4x4_16_add_c() in vpx_dsp/inv_txfm.c.
void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  int32x4_t temp1, temp2, temp3, temp4;
  int16x8_t step0, step1, tmp16_0, tmp16_1, t_out0, t_out1;
  // mask0 swaps the two 8-byte halves of a vector (used inside IDCT4);
  // mask1 concatenates the low 8 bytes of two vectors (pairs dest rows).
  uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF,
                       0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 };
  uint8x16_t mask1 = { 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
                       0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 };
  // v0 holds the first eight coefficients (rows 0-1), v1 the last eight
  // (rows 2-3).
  int16x8_t v0 = load_tran_low(0, input);
  int16x8_t v1 = load_tran_low(8 * sizeof(*input), input);
  int16x8_t t0 = vec_mergeh(v0, v1);
  int16x8_t t1 = vec_mergel(v0, v1);

  // Destination rows; only the first 4 bytes of each 16-byte load are used.
  uint8x16_t dest0 = vec_vsx_ld(0, dest);
  uint8x16_t dest1 = vec_vsx_ld(stride, dest);
  uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest);
  uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest);
  uint8x16_t zerov = vec_splat_u8(0);
  // Widen destination pixels to 16 bits by interleaving with zero bytes
  // (lane order follows the target's endianness).
  int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov);
  int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov);
  int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov);
  int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov);
  uint8x16_t output_v;
  uint8_t tmp_dest[16];
  ROUND_SHIFT_INIT
  PIXEL_ADD_INIT;

  // Complete the merge-based 4x4 transpose started with t0/t1 above.
  v0 = vec_mergeh(t0, t1);
  v1 = vec_mergel(t0, t1);

  IDCT4(v0, v1, t_out0, t_out1);  // first pass
  // transpose
  t0 = vec_mergeh(t_out0, t_out1);
  t1 = vec_mergel(t_out0, t_out1);
  v0 = vec_mergeh(t0, t1);
  v1 = vec_mergel(t0, t1);
  IDCT4(v0, v1, t_out0, t_out1);  // second pass

  // Final rounding (x + 8) >> 4, then add the predicted pixels and
  // saturate to [0, 255] with vec_packsu.
  PIXEL_ADD4(v0, t_out0);
  PIXEL_ADD4(v1, t_out1);
  tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0);
  tmp16_1 = vec_add(vec_perm(d_u2, d_u3, mask1), v1);
  output_v = vec_packsu(tmp16_0, tmp16_1);

  // Store through a scratch buffer: each destination row is only 4 bytes
  // wide, so a direct 16-byte vector store would clobber neighboring
  // pixels.  The scalar loop copies 4x4 bytes row by row.
  vec_vsx_st(output_v, 0, tmp_dest);
  for (int i = 0; i < 4; i++)
    for (int j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i];
}
    158 
/* Transpose an 8x8 matrix of 16-bit values held one row per vector in
 * in0..in7, leaving the transposed rows in out0..out7.  Three rounds:
 * 16-bit merges, 32-bit merges, then 64-bit recombination via
 * tr8_mask0/tr8_mask1 (which must be in scope).  The in* vectors are
 * clobbered as scratch by the second round. */
#define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
                     out3, out4, out5, out6, out7)                             \
  out0 = vec_mergeh(in0, in1);                                                 \
  out1 = vec_mergel(in0, in1);                                                 \
  out2 = vec_mergeh(in2, in3);                                                 \
  out3 = vec_mergel(in2, in3);                                                 \
  out4 = vec_mergeh(in4, in5);                                                 \
  out5 = vec_mergel(in4, in5);                                                 \
  out6 = vec_mergeh(in6, in7);                                                 \
  out7 = vec_mergel(in6, in7);                                                 \
  in0 = (int16x8_t)vec_mergeh((int32x4_t)out0, (int32x4_t)out2);               \
  in1 = (int16x8_t)vec_mergel((int32x4_t)out0, (int32x4_t)out2);               \
  in2 = (int16x8_t)vec_mergeh((int32x4_t)out1, (int32x4_t)out3);               \
  in3 = (int16x8_t)vec_mergel((int32x4_t)out1, (int32x4_t)out3);               \
  in4 = (int16x8_t)vec_mergeh((int32x4_t)out4, (int32x4_t)out6);               \
  in5 = (int16x8_t)vec_mergel((int32x4_t)out4, (int32x4_t)out6);               \
  in6 = (int16x8_t)vec_mergeh((int32x4_t)out5, (int32x4_t)out7);               \
  in7 = (int16x8_t)vec_mergel((int32x4_t)out5, (int32x4_t)out7);               \
  out0 = vec_perm(in0, in4, tr8_mask0);                                        \
  out1 = vec_perm(in0, in4, tr8_mask1);                                        \
  out2 = vec_perm(in1, in5, tr8_mask0);                                        \
  out3 = vec_perm(in1, in5, tr8_mask1);                                        \
  out4 = vec_perm(in2, in6, tr8_mask0);                                        \
  out5 = vec_perm(in2, in6, tr8_mask1);                                        \
  out6 = vec_perm(in3, in7, tr8_mask0);                                        \
  out7 = vec_perm(in3, in7, tr8_mask1);
    185 
/* Rounded rotation butterfly, the vector form of
 *   temp1 = step[x] * cospi_q - step[y] * cospi_z
 *   temp2 = step[x] * cospi_z + step[y] * cospi_q
 * i.e. outpt0[i] = ROUND(inpt0[i] * cospi0 - inpt1[i] * cospi1) and
 *      outpt1[i] = ROUND(inpt0[i] * cospi1 + inpt1[i] * cospi0).
 * vec_mule/vec_mulo pick the inpt0/inpt1 lanes out of the merged pairs.
 * Clobbers tmp16_0/1 and temp10/11; requires ROUND_SHIFT_INIT. */
#define STEP8_0(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1)             \
  tmp16_0 = vec_mergeh(inpt0, inpt1);                                     \
  tmp16_1 = vec_mergel(inpt0, inpt1);                                     \
  temp10 = vec_sub(vec_mule(tmp16_0, cospi0), vec_mulo(tmp16_0, cospi1)); \
  temp11 = vec_sub(vec_mule(tmp16_1, cospi0), vec_mulo(tmp16_1, cospi1)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                          \
  DCT_CONST_ROUND_SHIFT(temp11);                                          \
  outpt0 = vec_packs(temp10, temp11);                                     \
  temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \
  temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                          \
  DCT_CONST_ROUND_SHIFT(temp11);                                          \
  outpt1 = vec_packs(temp10, temp11);
    201 
/* Even-part step scaled by a single constant:
 *   outpt0[i] = ROUND((inpt0[i] - inpt1[i]) * cospi)
 *   outpt1[i] = ROUND((inpt0[i] + inpt1[i]) * cospi)
 * The difference/sum vectors are interleaved so vec_mule picks the
 * differences and vec_mulo the sums.  Clobbers tmp16_0..tmp16_3 and
 * temp10/11; requires ROUND_SHIFT_INIT. */
#define STEP8_1(inpt0, inpt1, outpt0, outpt1, cospi) \
  tmp16_2 = vec_sub(inpt0, inpt1);                   \
  tmp16_3 = vec_add(inpt0, inpt1);                   \
  tmp16_0 = vec_mergeh(tmp16_2, tmp16_3);            \
  tmp16_1 = vec_mergel(tmp16_2, tmp16_3);            \
  temp10 = vec_mule(tmp16_0, cospi);                 \
  temp11 = vec_mule(tmp16_1, cospi);                 \
  DCT_CONST_ROUND_SHIFT(temp10);                     \
  DCT_CONST_ROUND_SHIFT(temp11);                     \
  outpt0 = vec_packs(temp10, temp11);                \
  temp10 = vec_mulo(tmp16_0, cospi);                 \
  temp11 = vec_mulo(tmp16_1, cospi);                 \
  DCT_CONST_ROUND_SHIFT(temp10);                     \
  DCT_CONST_ROUND_SHIFT(temp11);                     \
  outpt1 = vec_packs(temp10, temp11);
    217 
/* One pass of the 8-point IDCT, in place on in0..in7 (one vector of
 * eight 16-bit values per coefficient index).  Mirrors the four stages
 * of idct8_c() in vpx_dsp/inv_txfm.c.  Clobbers step0..step7 plus the
 * scratch used by STEP8_0/STEP8_1 (tmp16_0..tmp16_3, temp10/11);
 * requires ROUND_SHIFT_INIT. */
#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7)    \
  /* stage 1 */                                          \
  step0 = in0;                                           \
  step2 = in4;                                           \
  step1 = in2;                                           \
  step3 = in6;                                           \
                                                         \
  STEP8_0(in1, in7, step4, step7, cospi28_v, cospi4_v);  \
  STEP8_0(in5, in3, step5, step6, cospi12_v, cospi20_v); \
                                                         \
  /* stage 2 */                                          \
  STEP8_1(step0, step2, in1, in0, cospi16_v);            \
  STEP8_0(step1, step3, in2, in3, cospi24_v, cospi8_v);  \
  in4 = vec_add(step4, step5);                           \
  in5 = vec_sub(step4, step5);                           \
  in6 = vec_sub(step7, step6);                           \
  in7 = vec_add(step6, step7);                           \
                                                         \
  /* stage 3 */                                          \
  step0 = vec_add(in0, in3);                             \
  step1 = vec_add(in1, in2);                             \
  step2 = vec_sub(in1, in2);                             \
  step3 = vec_sub(in0, in3);                             \
  step4 = in4;                                           \
  STEP8_1(in6, in5, step5, step6, cospi16_v);            \
  step7 = in7;                                           \
                                                         \
  /* stage 4: final butterfly, outputs back into in0..in7 */ \
  in0 = vec_add(step0, step7);                           \
  in1 = vec_add(step1, step6);                           \
  in2 = vec_add(step2, step5);                           \
  in3 = vec_add(step3, step4);                           \
  in4 = vec_sub(step3, step4);                           \
  in5 = vec_sub(step2, step5);                           \
  in6 = vec_sub(step1, step6);                           \
  in7 = vec_sub(step0, step7);
    254 
/* Reconstruction step: out = ((in + add) >> shiftx) + out, i.e. round
 * the residual by the given bias/shift and accumulate it onto the
 * widened destination pixels. */
#define PIXEL_ADD(in, out, add, shiftx) \
  out = vec_add(vec_sra(vec_add(in, add), shiftx), out);
    257 
    258 static uint8x16_t tr8_mask0 = {
    259   0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
    260   0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
    261 };
    262 static uint8x16_t tr8_mask1 = {
    263   0x8,  0x9,  0xA,  0xB,  0xC,  0xD,  0xE,  0xF,
    264   0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F
    265 };
// 8x8 inverse DCT (all 64 coefficients present) plus reconstruction:
// dest[r][c] = clip_to_u8(dest[r][c] + rounded inverse transform).
// C reference: vpx_idct8x8_64_add_c() in vpx_dsp/inv_txfm.c.
void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  int32x4_t temp10, temp11;
  int16x8_t step0, step1, step2, step3, step4, step5, step6, step7;
  int16x8_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp16_0, tmp16_1,
      tmp16_2, tmp16_3;
  // One vector per row of eight coefficients.
  int16x8_t src0 = load_tran_low(0, input);
  int16x8_t src1 = load_tran_low(8 * sizeof(*input), input);
  int16x8_t src2 = load_tran_low(16 * sizeof(*input), input);
  int16x8_t src3 = load_tran_low(24 * sizeof(*input), input);
  int16x8_t src4 = load_tran_low(32 * sizeof(*input), input);
  int16x8_t src5 = load_tran_low(40 * sizeof(*input), input);
  int16x8_t src6 = load_tran_low(48 * sizeof(*input), input);
  int16x8_t src7 = load_tran_low(56 * sizeof(*input), input);
  // Load the 8 destination rows up front; only their first 8 bytes are
  // reconstructed, the upper 8 bytes are kept to restore on store.
  uint8x16_t dest0 = vec_vsx_ld(0, dest);
  uint8x16_t dest1 = vec_vsx_ld(stride, dest);
  uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest);
  uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest);
  uint8x16_t dest4 = vec_vsx_ld(4 * stride, dest);
  uint8x16_t dest5 = vec_vsx_ld(5 * stride, dest);
  uint8x16_t dest6 = vec_vsx_ld(6 * stride, dest);
  uint8x16_t dest7 = vec_vsx_ld(7 * stride, dest);
  uint8x16_t zerov = vec_splat_u8(0);
  // Widen destination pixels to 16 bits by interleaving with zero bytes
  // (lane order follows the target's endianness).
  int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov);
  int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov);
  int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov);
  int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov);
  int16x8_t d_u4 = (int16x8_t)vec_mergeh(dest4, zerov);
  int16x8_t d_u5 = (int16x8_t)vec_mergeh(dest5, zerov);
  int16x8_t d_u6 = (int16x8_t)vec_mergeh(dest6, zerov);
  int16x8_t d_u7 = (int16x8_t)vec_mergeh(dest7, zerov);
  // add = 8 << 1 = 16; with shift5 this is the final (x + 16) >> 5
  // rounding of the two-pass 8x8 transform.
  int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(1));
  uint16x8_t shift5 = vec_splat_u16(5);
  uint8x16_t output0, output1, output2, output3;
  ROUND_SHIFT_INIT;

  TRANSPOSE8x8(src0, src1, src2, src3, src4, src5, src6, src7, tmp0, tmp1, tmp2,
               tmp3, tmp4, tmp5, tmp6, tmp7);

  IDCT8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);  // first pass
  TRANSPOSE8x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, src2,
               src3, src4, src5, src6, src7);
  IDCT8(src0, src1, src2, src3, src4, src5, src6, src7);  // second pass
  // Round each row, add the prediction, then saturate to [0, 255].
  PIXEL_ADD(src0, d_u0, add, shift5);
  PIXEL_ADD(src1, d_u1, add, shift5);
  PIXEL_ADD(src2, d_u2, add, shift5);
  PIXEL_ADD(src3, d_u3, add, shift5);
  PIXEL_ADD(src4, d_u4, add, shift5);
  PIXEL_ADD(src5, d_u5, add, shift5);
  PIXEL_ADD(src6, d_u6, add, shift5);
  PIXEL_ADD(src7, d_u7, add, shift5);
  output0 = vec_packsu(d_u0, d_u1);
  output1 = vec_packsu(d_u2, d_u3);
  output2 = vec_packsu(d_u4, d_u5);
  output3 = vec_packsu(d_u6, d_u7);

  // Each 16-byte store recombines (xxpermdi) the 8 reconstructed pixels
  // with the untouched half of the original destination load, so memory
  // beyond the 8-pixel-wide block is preserved.
  vec_vsx_st(xxpermdi(output0, dest0, 1), 0, dest);
  vec_vsx_st(xxpermdi(output0, dest1, 3), stride, dest);
  vec_vsx_st(xxpermdi(output1, dest2, 1), 2 * stride, dest);
  vec_vsx_st(xxpermdi(output1, dest3, 3), 3 * stride, dest);
  vec_vsx_st(xxpermdi(output2, dest4, 1), 4 * stride, dest);
  vec_vsx_st(xxpermdi(output2, dest5, 3), 5 * stride, dest);
  vec_vsx_st(xxpermdi(output3, dest6, 1), 6 * stride, dest);
  vec_vsx_st(xxpermdi(output3, dest7, 3), 7 * stride, dest);
}
    331 
/* Load 16 vectors from source with a functional load macro
 * (e.g. load_tran_low): inN = load(N * step + offset, source). */
#define LOAD_INPUT16(load, source, offset, step, in0, in1, in2, in3, in4, in5, \
                     in6, in7, in8, in9, inA, inB, inC, inD, inE, inF)         \
  in0 = load(offset, source);                                                  \
  in1 = load((step) + (offset), source);                                       \
  in2 = load(2 * (step) + (offset), source);                                   \
  in3 = load(3 * (step) + (offset), source);                                   \
  in4 = load(4 * (step) + (offset), source);                                   \
  in5 = load(5 * (step) + (offset), source);                                   \
  in6 = load(6 * (step) + (offset), source);                                   \
  in7 = load(7 * (step) + (offset), source);                                   \
  in8 = load(8 * (step) + (offset), source);                                   \
  in9 = load(9 * (step) + (offset), source);                                   \
  inA = load(10 * (step) + (offset), source);                                  \
  inB = load(11 * (step) + (offset), source);                                  \
  inC = load(12 * (step) + (offset), source);                                  \
  inD = load(13 * (step) + (offset), source);                                  \
  inE = load(14 * (step) + (offset), source);                                  \
  inF = load(15 * (step) + (offset), source);
    350 
/* Same computation as STEP8_1 --
 *   outpt0[i] = ROUND((inpt0[i] - inpt1[i]) * cospi)
 *   outpt1[i] = ROUND((inpt0[i] + inpt1[i]) * cospi)
 * -- but multiplying first and taking the sum/difference of the
 * products, using the wider temp20/21/30 scratch set of the 16-point
 * transform.  Clobbers tmp16_0/1 and temp10/11/20/21/30; requires
 * ROUND_SHIFT_INIT. */
#define STEP16_1(inpt0, inpt1, outpt0, outpt1, cospi) \
  tmp16_0 = vec_mergeh(inpt0, inpt1);                 \
  tmp16_1 = vec_mergel(inpt0, inpt1);                 \
  temp10 = vec_mule(tmp16_0, cospi);                  \
  temp11 = vec_mule(tmp16_1, cospi);                  \
  temp20 = vec_mulo(tmp16_0, cospi);                  \
  temp21 = vec_mulo(tmp16_1, cospi);                  \
  temp30 = vec_sub(temp10, temp20);                   \
  temp10 = vec_add(temp10, temp20);                   \
  temp20 = vec_sub(temp11, temp21);                   \
  temp21 = vec_add(temp11, temp21);                   \
  DCT_CONST_ROUND_SHIFT(temp30);                      \
  DCT_CONST_ROUND_SHIFT(temp20);                      \
  outpt0 = vec_packs(temp30, temp20);                 \
  DCT_CONST_ROUND_SHIFT(temp10);                      \
  DCT_CONST_ROUND_SHIFT(temp21);                      \
  outpt1 = vec_packs(temp10, temp21);
    368 
    369 #define IDCT16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, inA, inB,     \
    370                inC, inD, inE, inF, out0, out1, out2, out3, out4, out5, out6,   \
    371                out7, out8, out9, outA, outB, outC, outD, outE, outF)           \
    372   /* stage 1 */                                                                \
    373   /* out0 = in0; */                                                            \
    374   out1 = in8;                                                                  \
    375   out2 = in4;                                                                  \
    376   out3 = inC;                                                                  \
    377   out4 = in2;                                                                  \
    378   out5 = inA;                                                                  \
    379   out6 = in6;                                                                  \
    380   out7 = inE;                                                                  \
    381   out8 = in1;                                                                  \
    382   out9 = in9;                                                                  \
    383   outA = in5;                                                                  \
    384   outB = inD;                                                                  \
    385   outC = in3;                                                                  \
    386   outD = inB;                                                                  \
    387   outE = in7;                                                                  \
    388   outF = inF;                                                                  \
    389                                                                                \
    390   /* stage 2 */                                                                \
    391   /* in0 = out0; */                                                            \
    392   in1 = out1;                                                                  \
    393   in2 = out2;                                                                  \
    394   in3 = out3;                                                                  \
    395   in4 = out4;                                                                  \
    396   in5 = out5;                                                                  \
    397   in6 = out6;                                                                  \
    398   in7 = out7;                                                                  \
    399                                                                                \
    400   STEP8_0(out8, outF, in8, inF, cospi30_v, cospi2_v);                          \
    401   STEP8_0(out9, outE, in9, inE, cospi14_v, cospi18_v);                         \
    402   STEP8_0(outA, outD, inA, inD, cospi22_v, cospi10_v);                         \
    403   STEP8_0(outB, outC, inB, inC, cospi6_v, cospi26_v);                          \
    404                                                                                \
    405   /* stage 3 */                                                                \
    406   out0 = in0;                                                                  \
    407   out1 = in1;                                                                  \
    408   out2 = in2;                                                                  \
    409   out3 = in3;                                                                  \
    410                                                                                \
    411   STEP8_0(in4, in7, out4, out7, cospi28_v, cospi4_v);                          \
    412   STEP8_0(in5, in6, out5, out6, cospi12_v, cospi20_v);                         \
    413                                                                                \
    414   out8 = vec_add(in8, in9);                                                    \
    415   out9 = vec_sub(in8, in9);                                                    \
    416   outA = vec_sub(inB, inA);                                                    \
    417   outB = vec_add(inA, inB);                                                    \
    418   outC = vec_add(inC, inD);                                                    \
    419   outD = vec_sub(inC, inD);                                                    \
    420   outE = vec_sub(inF, inE);                                                    \
    421   outF = vec_add(inE, inF);                                                    \
    422                                                                                \
    423   /* stage 4 */                                                                \
    424   STEP16_1(out0, out1, in1, in0, cospi16_v);                                   \
    425   STEP8_0(out2, out3, in2, in3, cospi24_v, cospi8_v);                          \
    426   in4 = vec_add(out4, out5);                                                   \
    427   in5 = vec_sub(out4, out5);                                                   \
    428   in6 = vec_sub(out7, out6);                                                   \
    429   in7 = vec_add(out6, out7);                                                   \
    430                                                                                \
    431   in8 = out8;                                                                  \
    432   inF = outF;                                                                  \
    433   tmp16_0 = vec_mergeh(out9, outE);                                            \
    434   tmp16_1 = vec_mergel(out9, outE);                                            \
    435   temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \
    436   temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \
    437   DCT_CONST_ROUND_SHIFT(temp10);                                               \
    438   DCT_CONST_ROUND_SHIFT(temp11);                                               \
    439   in9 = vec_packs(temp10, temp11);                                             \
    440   temp10 = vec_add(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \
    441   temp11 = vec_add(vec_mule(tmp16_1, cospi24_v), vec_mulo(tmp16_1, cospi8_v)); \
    442   DCT_CONST_ROUND_SHIFT(temp10);                                               \
    443   DCT_CONST_ROUND_SHIFT(temp11);                                               \
    444   inE = vec_packs(temp10, temp11);                                             \
    445                                                                                \
    446   tmp16_0 = vec_mergeh(outA, outD);                                            \
    447   tmp16_1 = vec_mergel(outA, outD);                                            \
    448   temp10 =                                                                     \
    449       vec_sub(vec_mule(tmp16_0, cospi24_mv), vec_mulo(tmp16_0, cospi8_v));     \
    450   temp11 =                                                                     \
    451       vec_sub(vec_mule(tmp16_1, cospi24_mv), vec_mulo(tmp16_1, cospi8_v));     \
    452   DCT_CONST_ROUND_SHIFT(temp10);                                               \
    453   DCT_CONST_ROUND_SHIFT(temp11);                                               \
    454   inA = vec_packs(temp10, temp11);                                             \
    455   temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \
    456   temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \
    457   DCT_CONST_ROUND_SHIFT(temp10);                                               \
    458   DCT_CONST_ROUND_SHIFT(temp11);                                               \
    459   inD = vec_packs(temp10, temp11);                                             \
    460                                                                                \
    461   inB = outB;                                                                  \
    462   inC = outC;                                                                  \
    463                                                                                \
    464   /* stage 5 */                                                                \
    465   out0 = vec_add(in0, in3);                                                    \
    466   out1 = vec_add(in1, in2);                                                    \
    467   out2 = vec_sub(in1, in2);                                                    \
    468   out3 = vec_sub(in0, in3);                                                    \
    469   out4 = in4;                                                                  \
    470   STEP16_1(in6, in5, out5, out6, cospi16_v);                                   \
    471   out7 = in7;                                                                  \
    472                                                                                \
    473   out8 = vec_add(in8, inB);                                                    \
    474   out9 = vec_add(in9, inA);                                                    \
    475   outA = vec_sub(in9, inA);                                                    \
    476   outB = vec_sub(in8, inB);                                                    \
    477   outC = vec_sub(inF, inC);                                                    \
    478   outD = vec_sub(inE, inD);                                                    \
    479   outE = vec_add(inD, inE);                                                    \
    480   outF = vec_add(inC, inF);                                                    \
    481                                                                                \
    482   /* stage 6 */                                                                \
    483   in0 = vec_add(out0, out7);                                                   \
    484   in1 = vec_add(out1, out6);                                                   \
    485   in2 = vec_add(out2, out5);                                                   \
    486   in3 = vec_add(out3, out4);                                                   \
    487   in4 = vec_sub(out3, out4);                                                   \
    488   in5 = vec_sub(out2, out5);                                                   \
    489   in6 = vec_sub(out1, out6);                                                   \
    490   in7 = vec_sub(out0, out7);                                                   \
    491   in8 = out8;                                                                  \
    492   in9 = out9;                                                                  \
    493   STEP16_1(outD, outA, inA, inD, cospi16_v);                                   \
    494   STEP16_1(outC, outB, inB, inC, cospi16_v);                                   \
    495   inE = outE;                                                                  \
    496   inF = outF;                                                                  \
    497                                                                                \
    498   /* stage 7 */                                                                \
    499   out0 = vec_add(in0, inF);                                                    \
    500   out1 = vec_add(in1, inE);                                                    \
    501   out2 = vec_add(in2, inD);                                                    \
    502   out3 = vec_add(in3, inC);                                                    \
    503   out4 = vec_add(in4, inB);                                                    \
    504   out5 = vec_add(in5, inA);                                                    \
    505   out6 = vec_add(in6, in9);                                                    \
    506   out7 = vec_add(in7, in8);                                                    \
    507   out8 = vec_sub(in7, in8);                                                    \
    508   out9 = vec_sub(in6, in9);                                                    \
    509   outA = vec_sub(in5, inA);                                                    \
    510   outB = vec_sub(in4, inB);                                                    \
    511   outC = vec_sub(in3, inC);                                                    \
    512   outD = vec_sub(in2, inD);                                                    \
    513   outE = vec_sub(in1, inE);                                                    \
    514   outF = vec_sub(in0, inF);
    515 
/* Add one 16-pixel residual row to the destination and store it.
 * in0/in1: left/right 8-lane halves of the 16-bit residual row.
 * dst:     the previously loaded destination row (uint8x16_t).
 * offset:  byte offset of that row inside the destination buffer.
 * The merges with `zerov` widen dst's bytes to 16 bits; PIXEL_ADD then
 * applies the (x + 32) >> 6 rounding via the `add`/`shift6` constants,
 * and vec_packsu clamps the sums back to [0, 255] before the store.
 * NOTE(review): the store goes through the function-scope pointer
 * `dest`, not the `dst` parameter -- the macro relies on `d_uh`, `d_ul`,
 * `add`, `shift6`, `zerov` and `dest` existing at the expansion site. */
#define PIXEL_ADD_STORE16(in0, in1, dst, offset) \
  d_uh = (int16x8_t)vec_mergeh(dst, zerov);      \
  d_ul = (int16x8_t)vec_mergel(dst, zerov);      \
  PIXEL_ADD(in0, d_uh, add, shift6);             \
  PIXEL_ADD(in1, d_ul, add, shift6);             \
  vec_vsx_st(vec_packsu(d_uh, d_ul), offset, dest);
    522 
// Full 2-D inverse 16x16 DCT plus reconstruction (VSX).
// input:  256 tran_low_t coefficients in row-major order.
// dest:   16x16 pixel block the residual is added to; stride is the byte
//         distance between destination rows.
// Each 16-wide row is held as two 8-lane vector halves, so the block is
// processed as four 8x8 quadrants: src0x/src1x are the left/right halves
// of rows 0-7 and src2x/src3x the left/right halves of rows 8-15 (this
// pairing follows from the LOAD_INPUT16 argument order below).
void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  int32x4_t temp10, temp11, temp20, temp21, temp30;
  int16x8_t src00, src01, src02, src03, src04, src05, src06, src07, src10,
      src11, src12, src13, src14, src15, src16, src17;
  int16x8_t src20, src21, src22, src23, src24, src25, src26, src27, src30,
      src31, src32, src33, src34, src35, src36, src37;
  int16x8_t tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10,
      tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp16_0, tmp16_1;
  int16x8_t tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30,
      tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37;
  uint8x16_t dest0, dest1, dest2, dest3, dest4, dest5, dest6, dest7, dest8,
      dest9, destA, destB, destC, destD, destE, destF;
  int16x8_t d_uh, d_ul;
  // Rounding bias 32 (= 8 << 2) paired with the shift-by-6 inside
  // PIXEL_ADD; built from two splats because vec_splat_s16 only accepts
  // immediates in [-16, 15].
  int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2));
  uint16x8_t shift6 = vec_splat_u16(6);
  uint8x16_t zerov = vec_splat_u8(0);
  ROUND_SHIFT_INIT;

  // transform rows
  // load and transform the upper half of 16x16 matrix
  LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), src00, src10, src01,
               src11, src02, src12, src03, src13, src04, src14, src05, src15,
               src06, src16, src07, src17);
  // Transpose each 8x8 quadrant before and after IDCT16 so the 1-D
  // transform, which operates lane-wise, runs along the original rows.
  TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00,
               tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07);
  TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10,
               tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17);
  IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10, tmp11,
         tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, src00, src01, src02, src03,
         src04, src05, src06, src07, src10, src11, src12, src13, src14, src15,
         src16, src17);
  TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00,
               tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07);
  TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10,
               tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17);

  // load and transform the lower half of 16x16 matrix
  // (offset 8*8*2*sizeof(*input) = start of row 8 in the coefficient
  // buffer as consumed by load_tran_low)
  LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input),
               8 * sizeof(*input), src20, src30, src21, src31, src22, src32,
               src23, src33, src24, src34, src25, src35, src26, src36, src27,
               src37);
  TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20,
               tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27);
  TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30,
               tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37);
  IDCT16(tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30, tmp31,
         tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src20, src21, src22, src23,
         src24, src25, src26, src27, src30, src31, src32, src33, src34, src35,
         src36, src37);
  TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20,
               tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27);
  TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30,
               tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37);

  // transform columns
  // left half first
  IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp20, tmp21,
         tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, src00, src01, src02, src03,
         src04, src05, src06, src07, src20, src21, src22, src23, src24, src25,
         src26, src27);
  // right half
  IDCT16(tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp30, tmp31,
         tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src10, src11, src12, src13,
         src14, src15, src16, src17, src30, src31, src32, src33, src34, src35,
         src36, src37);

  // load dest
  LOAD_INPUT16(vec_vsx_ld, dest, 0, stride, dest0, dest1, dest2, dest3, dest4,
               dest5, dest6, dest7, dest8, dest9, destA, destB, destC, destD,
               destE, destF);

  // Round/shift the residual rows, add them to the destination pixels
  // with unsigned saturation, and store; offsets are byte offsets into
  // dest (row r lives at r * stride).
  PIXEL_ADD_STORE16(src00, src10, dest0, 0);
  PIXEL_ADD_STORE16(src01, src11, dest1, stride);
  PIXEL_ADD_STORE16(src02, src12, dest2, 2 * stride);
  PIXEL_ADD_STORE16(src03, src13, dest3, 3 * stride);
  PIXEL_ADD_STORE16(src04, src14, dest4, 4 * stride);
  PIXEL_ADD_STORE16(src05, src15, dest5, 5 * stride);
  PIXEL_ADD_STORE16(src06, src16, dest6, 6 * stride);
  PIXEL_ADD_STORE16(src07, src17, dest7, 7 * stride);

  PIXEL_ADD_STORE16(src20, src30, dest8, 8 * stride);
  PIXEL_ADD_STORE16(src21, src31, dest9, 9 * stride);
  PIXEL_ADD_STORE16(src22, src32, destA, 10 * stride);
  PIXEL_ADD_STORE16(src23, src33, destB, 11 * stride);
  PIXEL_ADD_STORE16(src24, src34, destC, 12 * stride);
  PIXEL_ADD_STORE16(src25, src35, destD, 13 * stride);
  PIXEL_ADD_STORE16(src26, src36, destE, 14 * stride);
  PIXEL_ADD_STORE16(src27, src37, destF, 15 * stride);
}
    613 
/* Load an 8x32 block of coefficients: 8 rows of 32 values, each row held
 * as four 8-lane vectors inR0..inR3 (R = 0..7).  Consecutive loads step
 * 16 bytes apart, so a full row spans 64 bytes and the 32 loads cover
 * offset .. offset + 31*16.  `load` is the loader (e.g. load_tran_low or
 * vec_vsx_ld) and `input` must be in scope at the expansion site.
 * NOTE(review): the 16-byte step assumes the loader consumes byte
 * offsets into 16-bit data -- confirm the offset scaling passed by the
 * callers for high-bitdepth (32-bit tran_low_t) builds. */
#define LOAD_8x32(load, in00, in01, in02, in03, in10, in11, in12, in13, in20, \
                  in21, in22, in23, in30, in31, in32, in33, in40, in41, in42, \
                  in43, in50, in51, in52, in53, in60, in61, in62, in63, in70, \
                  in71, in72, in73, offset)                                   \
  /* load the first row from the 8x32 block*/                                 \
  in00 = load(offset, input);                                                 \
  in01 = load(offset + 16, input);                                            \
  in02 = load(offset + 2 * 16, input);                                        \
  in03 = load(offset + 3 * 16, input);                                        \
                                                                              \
  in10 = load(offset + 4 * 16, input);                                        \
  in11 = load(offset + 5 * 16, input);                                        \
  in12 = load(offset + 6 * 16, input);                                        \
  in13 = load(offset + 7 * 16, input);                                        \
                                                                              \
  in20 = load(offset + 8 * 16, input);                                        \
  in21 = load(offset + 9 * 16, input);                                        \
  in22 = load(offset + 10 * 16, input);                                       \
  in23 = load(offset + 11 * 16, input);                                       \
                                                                              \
  in30 = load(offset + 12 * 16, input);                                       \
  in31 = load(offset + 13 * 16, input);                                       \
  in32 = load(offset + 14 * 16, input);                                       \
  in33 = load(offset + 15 * 16, input);                                       \
                                                                              \
  in40 = load(offset + 16 * 16, input);                                       \
  in41 = load(offset + 17 * 16, input);                                       \
  in42 = load(offset + 18 * 16, input);                                       \
  in43 = load(offset + 19 * 16, input);                                       \
                                                                              \
  in50 = load(offset + 20 * 16, input);                                       \
  in51 = load(offset + 21 * 16, input);                                       \
  in52 = load(offset + 22 * 16, input);                                       \
  in53 = load(offset + 23 * 16, input);                                       \
                                                                              \
  in60 = load(offset + 24 * 16, input);                                       \
  in61 = load(offset + 25 * 16, input);                                       \
  in62 = load(offset + 26 * 16, input);                                       \
  in63 = load(offset + 27 * 16, input);                                       \
                                                                              \
  /* load the last row from the 8x32 block*/                                  \
  in70 = load(offset + 28 * 16, input);                                       \
  in71 = load(offset + 29 * 16, input);                                       \
  in72 = load(offset + 30 * 16, input);                                       \
  in73 = load(offset + 31 * 16, input);
    659 
/* Butterfly rotation used by the 32-point stages, computing per lane
 * (with x = inpt0, y = inpt1, cospi_q = cospi0, cospi_z = cospi1):
 *          outpt0 = -x * cospi_q + y * cospi_z
 *          outpt1 =  x * cospi_z + y * cospi_q
 * The merges interleave inpt0/inpt1 so that vec_mule multiplies the
 * inpt0 lanes and vec_mulo the inpt1 lanes (the cospi vectors are
 * all-lane splats, so even/odd lane selection on them is harmless);
 * DCT_CONST_ROUND_SHIFT rounds the 32-bit products and vec_packs
 * saturates back to 16 bits.  Relies on tmp16_0/tmp16_1/temp10/temp11
 * being declared at the expansion site. */
#define STEP32(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1)              \
  tmp16_0 = vec_mergeh(inpt0, inpt1);                                     \
  tmp16_1 = vec_mergel(inpt0, inpt1);                                     \
  temp10 = vec_sub(vec_mulo(tmp16_0, cospi1), vec_mule(tmp16_0, cospi0)); \
  temp11 = vec_sub(vec_mulo(tmp16_1, cospi1), vec_mule(tmp16_1, cospi0)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                          \
  DCT_CONST_ROUND_SHIFT(temp11);                                          \
  outpt0 = vec_packs(temp10, temp11);                                     \
  temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \
  temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                          \
  DCT_CONST_ROUND_SHIFT(temp11);                                          \
  outpt1 = vec_packs(temp10, temp11);
    675 
/* Negated-angle variant of STEP32, computing per lane (with x = inpt0,
 * y = inpt1, cospi_q = cospi0, cospi_z = cospi1):
 *          outpt0 = -x * cospi_q - y * cospi_z
 *          outpt1 = -x * cospi_z + y * cospi_q
 * cospi1m must be the element-wise negation of cospi1 (e.g. cospi8m_v
 * for cospi8_v): vec_mulo(tmp, cospi1m) yields -y*cospi_z directly,
 * avoiding an extra negate of the 32-bit products.  Same expansion-site
 * requirements as STEP32 (tmp16_0/tmp16_1/temp10/temp11). */
#define STEP32_1(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1, cospi1m)    \
  tmp16_0 = vec_mergeh(inpt0, inpt1);                                      \
  tmp16_1 = vec_mergel(inpt0, inpt1);                                      \
  temp10 = vec_sub(vec_mulo(tmp16_0, cospi1m), vec_mule(tmp16_0, cospi0)); \
  temp11 = vec_sub(vec_mulo(tmp16_1, cospi1m), vec_mule(tmp16_1, cospi0)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                           \
  DCT_CONST_ROUND_SHIFT(temp11);                                           \
  outpt0 = vec_packs(temp10, temp11);                                      \
  temp10 = vec_sub(vec_mulo(tmp16_0, cospi0), vec_mule(tmp16_0, cospi1));  \
  temp11 = vec_sub(vec_mulo(tmp16_1, cospi0), vec_mule(tmp16_1, cospi1));  \
  DCT_CONST_ROUND_SHIFT(temp10);                                           \
  DCT_CONST_ROUND_SHIFT(temp11);                                           \
  outpt1 = vec_packs(temp10, temp11);
    691 
    692 #define IDCT32(in0, in1, in2, in3, out)                                \
    693                                                                        \
    694   /* stage 1 */                                                        \
    695   /* out[0][0] = in[0][0]; */                                          \
    696   out[0][1] = in2[0];                                                  \
    697   out[0][2] = in1[0];                                                  \
    698   out[0][3] = in3[0];                                                  \
    699   out[0][4] = in0[4];                                                  \
    700   out[0][5] = in2[4];                                                  \
    701   out[0][6] = in1[4];                                                  \
    702   out[0][7] = in3[4];                                                  \
    703   out[1][0] = in0[2];                                                  \
    704   out[1][1] = in2[2];                                                  \
    705   out[1][2] = in1[2];                                                  \
    706   out[1][3] = in3[2];                                                  \
    707   out[1][4] = in0[6];                                                  \
    708   out[1][5] = in2[6];                                                  \
    709   out[1][6] = in1[6];                                                  \
    710   out[1][7] = in3[6];                                                  \
    711                                                                        \
    712   STEP8_0(in0[1], in3[7], out[2][0], out[3][7], cospi31_v, cospi1_v);  \
    713   STEP8_0(in2[1], in1[7], out[2][1], out[3][6], cospi15_v, cospi17_v); \
    714   STEP8_0(in1[1], in2[7], out[2][2], out[3][5], cospi23_v, cospi9_v);  \
    715   STEP8_0(in3[1], in0[7], out[2][3], out[3][4], cospi7_v, cospi25_v);  \
    716   STEP8_0(in0[5], in3[3], out[2][4], out[3][3], cospi27_v, cospi5_v);  \
    717   STEP8_0(in2[5], in1[3], out[2][5], out[3][2], cospi11_v, cospi21_v); \
    718   STEP8_0(in1[5], in2[3], out[2][6], out[3][1], cospi19_v, cospi13_v); \
    719   STEP8_0(in3[5], in0[3], out[2][7], out[3][0], cospi3_v, cospi29_v);  \
    720                                                                        \
    721   /* stage 2 */                                                        \
    722   /* in0[0] = out[0][0]; */                                            \
    723   in0[1] = out[0][1];                                                  \
    724   in0[2] = out[0][2];                                                  \
    725   in0[3] = out[0][3];                                                  \
    726   in0[4] = out[0][4];                                                  \
    727   in0[5] = out[0][5];                                                  \
    728   in0[6] = out[0][6];                                                  \
    729   in0[7] = out[0][7];                                                  \
    730                                                                        \
    731   STEP8_0(out[1][0], out[1][7], in1[0], in1[7], cospi30_v, cospi2_v);  \
    732   STEP8_0(out[1][1], out[1][6], in1[1], in1[6], cospi14_v, cospi18_v); \
    733   STEP8_0(out[1][2], out[1][5], in1[2], in1[5], cospi22_v, cospi10_v); \
    734   STEP8_0(out[1][3], out[1][4], in1[3], in1[4], cospi6_v, cospi26_v);  \
    735                                                                        \
    736   in2[0] = vec_add(out[2][0], out[2][1]);                              \
    737   in2[1] = vec_sub(out[2][0], out[2][1]);                              \
    738   in2[2] = vec_sub(out[2][3], out[2][2]);                              \
    739   in2[3] = vec_add(out[2][3], out[2][2]);                              \
    740   in2[4] = vec_add(out[2][4], out[2][5]);                              \
    741   in2[5] = vec_sub(out[2][4], out[2][5]);                              \
    742   in2[6] = vec_sub(out[2][7], out[2][6]);                              \
    743   in2[7] = vec_add(out[2][7], out[2][6]);                              \
    744   in3[0] = vec_add(out[3][0], out[3][1]);                              \
    745   in3[1] = vec_sub(out[3][0], out[3][1]);                              \
    746   in3[2] = vec_sub(out[3][3], out[3][2]);                              \
    747   in3[3] = vec_add(out[3][3], out[3][2]);                              \
    748   in3[4] = vec_add(out[3][4], out[3][5]);                              \
    749   in3[5] = vec_sub(out[3][4], out[3][5]);                              \
    750   in3[6] = vec_sub(out[3][7], out[3][6]);                              \
    751   in3[7] = vec_add(out[3][6], out[3][7]);                              \
    752                                                                        \
    753   /* stage 3 */                                                        \
    754   out[0][0] = in0[0];                                                  \
    755   out[0][1] = in0[1];                                                  \
    756   out[0][2] = in0[2];                                                  \
    757   out[0][3] = in0[3];                                                  \
    758                                                                        \
    759   STEP8_0(in0[4], in0[7], out[0][4], out[0][7], cospi28_v, cospi4_v);  \
    760   STEP8_0(in0[5], in0[6], out[0][5], out[0][6], cospi12_v, cospi20_v); \
    761                                                                        \
    762   out[1][0] = vec_add(in1[0], in1[1]);                                 \
    763   out[1][1] = vec_sub(in1[0], in1[1]);                                 \
    764   out[1][2] = vec_sub(in1[3], in1[2]);                                 \
    765   out[1][3] = vec_add(in1[2], in1[3]);                                 \
    766   out[1][4] = vec_add(in1[4], in1[5]);                                 \
    767   out[1][5] = vec_sub(in1[4], in1[5]);                                 \
    768   out[1][6] = vec_sub(in1[7], in1[6]);                                 \
    769   out[1][7] = vec_add(in1[6], in1[7]);                                 \
    770                                                                        \
    771   out[2][0] = in2[0];                                                  \
    772   out[3][7] = in3[7];                                                  \
    773   STEP32(in2[1], in3[6], out[2][1], out[3][6], cospi4_v, cospi28_v);   \
    774   STEP32_1(in2[2], in3[5], out[2][2], out[3][5], cospi28_v, cospi4_v,  \
    775            cospi4m_v);                                                 \
    776   out[2][3] = in2[3];                                                  \
    777   out[2][4] = in2[4];                                                  \
    778   STEP32(in2[5], in3[2], out[2][5], out[3][2], cospi20_v, cospi12_v);  \
    779   STEP32_1(in2[6], in3[1], out[2][6], out[3][1], cospi12_v, cospi20_v, \
    780            cospi20m_v);                                                \
    781   out[2][7] = in2[7];                                                  \
    782   out[3][0] = in3[0];                                                  \
    783   out[3][3] = in3[3];                                                  \
    784   out[3][4] = in3[4];                                                  \
    785                                                                        \
    786   /* stage 4 */                                                        \
    787   STEP16_1(out[0][0], out[0][1], in0[1], in0[0], cospi16_v);           \
    788   STEP8_0(out[0][2], out[0][3], in0[2], in0[3], cospi24_v, cospi8_v);  \
    789   in0[4] = vec_add(out[0][4], out[0][5]);                              \
    790   in0[5] = vec_sub(out[0][4], out[0][5]);                              \
    791   in0[6] = vec_sub(out[0][7], out[0][6]);                              \
    792   in0[7] = vec_add(out[0][7], out[0][6]);                              \
    793                                                                        \
    794   in1[0] = out[1][0];                                                  \
    795   in1[7] = out[1][7];                                                  \
    796   STEP32(out[1][1], out[1][6], in1[1], in1[6], cospi8_v, cospi24_v);   \
    797   STEP32_1(out[1][2], out[1][5], in1[2], in1[5], cospi24_v, cospi8_v,  \
    798            cospi8m_v);                                                 \
    799   in1[3] = out[1][3];                                                  \
    800   in1[4] = out[1][4];                                                  \
    801                                                                        \
    802   in2[0] = vec_add(out[2][0], out[2][3]);                              \
    803   in2[1] = vec_add(out[2][1], out[2][2]);                              \
    804   in2[2] = vec_sub(out[2][1], out[2][2]);                              \
    805   in2[3] = vec_sub(out[2][0], out[2][3]);                              \
    806   in2[4] = vec_sub(out[2][7], out[2][4]);                              \
    807   in2[5] = vec_sub(out[2][6], out[2][5]);                              \
    808   in2[6] = vec_add(out[2][5], out[2][6]);                              \
    809   in2[7] = vec_add(out[2][4], out[2][7]);                              \
    810                                                                        \
    811   in3[0] = vec_add(out[3][0], out[3][3]);                              \
    812   in3[1] = vec_add(out[3][1], out[3][2]);                              \
    813   in3[2] = vec_sub(out[3][1], out[3][2]);                              \
    814   in3[3] = vec_sub(out[3][0], out[3][3]);                              \
    815   in3[4] = vec_sub(out[3][7], out[3][4]);                              \
    816   in3[5] = vec_sub(out[3][6], out[3][5]);                              \
    817   in3[6] = vec_add(out[3][5], out[3][6]);                              \
    818   in3[7] = vec_add(out[3][4], out[3][7]);                              \
    819                                                                        \
    820   /* stage 5 */                                                        \
    821   out[0][0] = vec_add(in0[0], in0[3]);                                 \
    822   out[0][1] = vec_add(in0[1], in0[2]);                                 \
    823   out[0][2] = vec_sub(in0[1], in0[2]);                                 \
    824   out[0][3] = vec_sub(in0[0], in0[3]);                                 \
    825   out[0][4] = in0[4];                                                  \
    826   STEP16_1(in0[6], in0[5], out[0][5], out[0][6], cospi16_v);           \
    827   out[0][7] = in0[7];                                                  \
    828                                                                        \
    829   out[1][0] = vec_add(in1[0], in1[3]);                                 \
    830   out[1][1] = vec_add(in1[1], in1[2]);                                 \
    831   out[1][2] = vec_sub(in1[1], in1[2]);                                 \
    832   out[1][3] = vec_sub(in1[0], in1[3]);                                 \
    833   out[1][4] = vec_sub(in1[7], in1[4]);                                 \
    834   out[1][5] = vec_sub(in1[6], in1[5]);                                 \
    835   out[1][6] = vec_add(in1[5], in1[6]);                                 \
    836   out[1][7] = vec_add(in1[4], in1[7]);                                 \
    837                                                                        \
    838   out[2][0] = in2[0];                                                  \
    839   out[2][1] = in2[1];                                                  \
    840   STEP32(in2[2], in3[5], out[2][2], out[3][5], cospi8_v, cospi24_v);   \
    841   STEP32(in2[3], in3[4], out[2][3], out[3][4], cospi8_v, cospi24_v);   \
    842   STEP32_1(in2[4], in3[3], out[2][4], out[3][3], cospi24_v, cospi8_v,  \
    843            cospi8m_v);                                                 \
    844   STEP32_1(in2[5], in3[2], out[2][5], out[3][2], cospi24_v, cospi8_v,  \
    845            cospi8m_v);                                                 \
    846   out[2][6] = in2[6];                                                  \
    847   out[2][7] = in2[7];                                                  \
    848   out[3][0] = in3[0];                                                  \
    849   out[3][1] = in3[1];                                                  \
    850   out[3][6] = in3[6];                                                  \
    851   out[3][7] = in3[7];                                                  \
    852                                                                        \
    853   /* stage 6 */                                                        \
    854   in0[0] = vec_add(out[0][0], out[0][7]);                              \
    855   in0[1] = vec_add(out[0][1], out[0][6]);                              \
    856   in0[2] = vec_add(out[0][2], out[0][5]);                              \
    857   in0[3] = vec_add(out[0][3], out[0][4]);                              \
    858   in0[4] = vec_sub(out[0][3], out[0][4]);                              \
    859   in0[5] = vec_sub(out[0][2], out[0][5]);                              \
    860   in0[6] = vec_sub(out[0][1], out[0][6]);                              \
    861   in0[7] = vec_sub(out[0][0], out[0][7]);                              \
    862   in1[0] = out[1][0];                                                  \
    863   in1[1] = out[1][1];                                                  \
    864   STEP16_1(out[1][5], out[1][2], in1[2], in1[5], cospi16_v);           \
    865   STEP16_1(out[1][4], out[1][3], in1[3], in1[4], cospi16_v);           \
    866   in1[6] = out[1][6];                                                  \
    867   in1[7] = out[1][7];                                                  \
    868                                                                        \
    869   in2[0] = vec_add(out[2][0], out[2][7]);                              \
    870   in2[1] = vec_add(out[2][1], out[2][6]);                              \
    871   in2[2] = vec_add(out[2][2], out[2][5]);                              \
    872   in2[3] = vec_add(out[2][3], out[2][4]);                              \
    873   in2[4] = vec_sub(out[2][3], out[2][4]);                              \
    874   in2[5] = vec_sub(out[2][2], out[2][5]);                              \
    875   in2[6] = vec_sub(out[2][1], out[2][6]);                              \
    876   in2[7] = vec_sub(out[2][0], out[2][7]);                              \
    877                                                                        \
    878   in3[0] = vec_sub(out[3][7], out[3][0]);                              \
    879   in3[1] = vec_sub(out[3][6], out[3][1]);                              \
    880   in3[2] = vec_sub(out[3][5], out[3][2]);                              \
    881   in3[3] = vec_sub(out[3][4], out[3][3]);                              \
    882   in3[4] = vec_add(out[3][4], out[3][3]);                              \
    883   in3[5] = vec_add(out[3][5], out[3][2]);                              \
    884   in3[6] = vec_add(out[3][6], out[3][1]);                              \
    885   in3[7] = vec_add(out[3][7], out[3][0]);                              \
    886                                                                        \
    887   /* stage 7 */                                                        \
    888   out[0][0] = vec_add(in0[0], in1[7]);                                 \
    889   out[0][1] = vec_add(in0[1], in1[6]);                                 \
    890   out[0][2] = vec_add(in0[2], in1[5]);                                 \
    891   out[0][3] = vec_add(in0[3], in1[4]);                                 \
    892   out[0][4] = vec_add(in0[4], in1[3]);                                 \
    893   out[0][5] = vec_add(in0[5], in1[2]);                                 \
    894   out[0][6] = vec_add(in0[6], in1[1]);                                 \
    895   out[0][7] = vec_add(in0[7], in1[0]);                                 \
    896   out[1][0] = vec_sub(in0[7], in1[0]);                                 \
    897   out[1][1] = vec_sub(in0[6], in1[1]);                                 \
    898   out[1][2] = vec_sub(in0[5], in1[2]);                                 \
    899   out[1][3] = vec_sub(in0[4], in1[3]);                                 \
    900   out[1][4] = vec_sub(in0[3], in1[4]);                                 \
    901   out[1][5] = vec_sub(in0[2], in1[5]);                                 \
    902   out[1][6] = vec_sub(in0[1], in1[6]);                                 \
    903   out[1][7] = vec_sub(in0[0], in1[7]);                                 \
    904                                                                        \
    905   out[2][0] = in2[0];                                                  \
    906   out[2][1] = in2[1];                                                  \
    907   out[2][2] = in2[2];                                                  \
    908   out[2][3] = in2[3];                                                  \
    909   STEP16_1(in3[3], in2[4], out[2][4], out[3][3], cospi16_v);           \
    910   STEP16_1(in3[2], in2[5], out[2][5], out[3][2], cospi16_v);           \
    911   STEP16_1(in3[1], in2[6], out[2][6], out[3][1], cospi16_v);           \
    912   STEP16_1(in3[0], in2[7], out[2][7], out[3][0], cospi16_v);           \
    913   out[3][4] = in3[4];                                                  \
    914   out[3][5] = in3[5];                                                  \
    915   out[3][6] = in3[6];                                                  \
    916   out[3][7] = in3[7];                                                  \
    917                                                                        \
    918   /* final */                                                          \
    919   in0[0] = vec_add(out[0][0], out[3][7]);                              \
    920   in0[1] = vec_add(out[0][1], out[3][6]);                              \
    921   in0[2] = vec_add(out[0][2], out[3][5]);                              \
    922   in0[3] = vec_add(out[0][3], out[3][4]);                              \
    923   in0[4] = vec_add(out[0][4], out[3][3]);                              \
    924   in0[5] = vec_add(out[0][5], out[3][2]);                              \
    925   in0[6] = vec_add(out[0][6], out[3][1]);                              \
    926   in0[7] = vec_add(out[0][7], out[3][0]);                              \
    927   in1[0] = vec_add(out[1][0], out[2][7]);                              \
    928   in1[1] = vec_add(out[1][1], out[2][6]);                              \
    929   in1[2] = vec_add(out[1][2], out[2][5]);                              \
    930   in1[3] = vec_add(out[1][3], out[2][4]);                              \
    931   in1[4] = vec_add(out[1][4], out[2][3]);                              \
    932   in1[5] = vec_add(out[1][5], out[2][2]);                              \
    933   in1[6] = vec_add(out[1][6], out[2][1]);                              \
    934   in1[7] = vec_add(out[1][7], out[2][0]);                              \
    935   in2[0] = vec_sub(out[1][7], out[2][0]);                              \
    936   in2[1] = vec_sub(out[1][6], out[2][1]);                              \
    937   in2[2] = vec_sub(out[1][5], out[2][2]);                              \
    938   in2[3] = vec_sub(out[1][4], out[2][3]);                              \
    939   in2[4] = vec_sub(out[1][3], out[2][4]);                              \
    940   in2[5] = vec_sub(out[1][2], out[2][5]);                              \
    941   in2[6] = vec_sub(out[1][1], out[2][6]);                              \
    942   in2[7] = vec_sub(out[1][0], out[2][7]);                              \
    943   in3[0] = vec_sub(out[0][7], out[3][0]);                              \
    944   in3[1] = vec_sub(out[0][6], out[3][1]);                              \
    945   in3[2] = vec_sub(out[0][5], out[3][2]);                              \
    946   in3[3] = vec_sub(out[0][4], out[3][3]);                              \
    947   in3[4] = vec_sub(out[0][3], out[3][4]);                              \
    948   in3[5] = vec_sub(out[0][2], out[3][5]);                              \
    949   in3[6] = vec_sub(out[0][1], out[3][6]);                              \
    950   in3[7] = vec_sub(out[0][0], out[3][7]);
    951 
// NOT A FULL TRANSPOSE! Transposes just each 8x8 block in each row,
// does not transpose rows
// in/out are int16x8_t[4][8]: in[b] is one 8x8 sub-block (8 row vectors of
// 8 int16 lanes) of an 8x32 strip; each of the 4 sub-blocks is transposed
// independently, so a full 32x32 transpose still needs the rows swapped by
// the caller (done implicitly by how the column pass consumes the data).
#define TRANSPOSE_8x32(in, out)                                                \
  /* transpose 4 of 8x8 blocks */                                              \
  TRANSPOSE8x8(in[0][0], in[0][1], in[0][2], in[0][3], in[0][4], in[0][5],     \
               in[0][6], in[0][7], out[0][0], out[0][1], out[0][2], out[0][3], \
               out[0][4], out[0][5], out[0][6], out[0][7]);                    \
  TRANSPOSE8x8(in[1][0], in[1][1], in[1][2], in[1][3], in[1][4], in[1][5],     \
               in[1][6], in[1][7], out[1][0], out[1][1], out[1][2], out[1][3], \
               out[1][4], out[1][5], out[1][6], out[1][7]);                    \
  TRANSPOSE8x8(in[2][0], in[2][1], in[2][2], in[2][3], in[2][4], in[2][5],     \
               in[2][6], in[2][7], out[2][0], out[2][1], out[2][2], out[2][3], \
               out[2][4], out[2][5], out[2][6], out[2][7]);                    \
  TRANSPOSE8x8(in[3][0], in[3][1], in[3][2], in[3][3], in[3][4], in[3][5],     \
               in[3][6], in[3][7], out[3][0], out[3][1], out[3][2], out[3][3], \
               out[3][4], out[3][5], out[3][6], out[3][7]);
    968 
// Add one reconstructed 32-pixel row (four int16x8_t vectors in0..in3) to
// the destination row at byte offset (step)*stride, then store it back with
// unsigned saturation. Two 16-byte halves are processed: pixels are widened
// from u8 to s16 by interleaving with a zero vector, PIXEL_ADD applies the
// rounding constant `add` and the >>6 shift `shift6`, and vec_packsu
// narrows/clamps back to u8.
// Relies on caller-scope locals: dst, d_uh, d_ul, add, shift6, zerov,
// dest, stride.
// NOTE(review): the (int16x8_t) casts after vec_mergeh/vec_mergel assume a
// big-endian lane layout for the u8->s16 widening — confirm against the
// helpers in types_vsx.h for little-endian targets.
#define PIXEL_ADD_STORE32(in0, in1, in2, in3, step)        \
  dst = vec_vsx_ld((step)*stride, dest);                   \
  d_uh = (int16x8_t)vec_mergeh(dst, zerov);                \
  d_ul = (int16x8_t)vec_mergel(dst, zerov);                \
  PIXEL_ADD(in0, d_uh, add, shift6);                       \
  PIXEL_ADD(in1, d_ul, add, shift6);                       \
  vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride, dest); \
  dst = vec_vsx_ld((step)*stride + 16, dest);              \
  d_uh = (int16x8_t)vec_mergeh(dst, zerov);                \
  d_ul = (int16x8_t)vec_mergel(dst, zerov);                \
  PIXEL_ADD(in2, d_uh, add, shift6);                       \
  PIXEL_ADD(in3, d_ul, add, shift6);                       \
  vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride + 16, dest);
    982 
// Add-and-store one 8-row band of final idct output. in[c][r] is row vector
// r (0..7) of column group c (0..3, each 8 lanes wide, covering the 32
// columns); rows are written to destination rows offset..offset+7 via
// PIXEL_ADD_STORE32, so this inherits its caller-scope locals (dst, d_uh,
// d_ul, add, shift6, zerov, dest, stride).
#define ADD_STORE_BLOCK(in, offset)                                      \
  PIXEL_ADD_STORE32(in[0][0], in[1][0], in[2][0], in[3][0], offset + 0); \
  PIXEL_ADD_STORE32(in[0][1], in[1][1], in[2][1], in[3][1], offset + 1); \
  PIXEL_ADD_STORE32(in[0][2], in[1][2], in[2][2], in[3][2], offset + 2); \
  PIXEL_ADD_STORE32(in[0][3], in[1][3], in[2][3], in[3][3], offset + 3); \
  PIXEL_ADD_STORE32(in[0][4], in[1][4], in[2][4], in[3][4], offset + 4); \
  PIXEL_ADD_STORE32(in[0][5], in[1][5], in[2][5], in[3][5], offset + 5); \
  PIXEL_ADD_STORE32(in[0][6], in[1][6], in[2][6], in[3][6], offset + 6); \
  PIXEL_ADD_STORE32(in[0][7], in[1][7], in[2][7], in[3][7], offset + 7);
    992 
// Full 32x32 inverse DCT for the up-to-1024-nonzero-coefficient case:
// reconstructs the residual from `input` and adds it (with rounding and
// unsigned-saturating clamp) into the 8-bit destination `dest`, which has
// `stride` bytes per row.
void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest,
                                int stride) {
  // srcN[c][r]: the N-th 8-row band of the 32x32 block, split into four
  // 8-lane column groups c with eight row vectors r. tmp is transpose /
  // idct scratch of the same shape.
  int16x8_t src0[4][8], src1[4][8], src2[4][8], src3[4][8], tmp[4][8];
  // Scratch presumably consumed by name inside the IDCT32/STEP macros —
  // not referenced directly in this function body; TODO confirm.
  int16x8_t tmp16_0, tmp16_1;
  int32x4_t temp10, temp11, temp20, temp21, temp30;
  uint8x16_t dst;
  // Widened destination pixels, used by name inside PIXEL_ADD_STORE32.
  int16x8_t d_uh, d_ul;
  // Rounding offset for the final >>6 shift: 8 << 2 == 32.
  int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2));
  uint16x8_t shift6 = vec_splat_u16(6);
  uint8x16_t zerov = vec_splat_u8(0);

  ROUND_SHIFT_INIT;

  // Load the first 8-row band (8 rows x 32 coefficients) at offset 0.
  LOAD_8x32(load_tran_low, src0[0][0], src0[1][0], src0[2][0], src0[3][0],
            src0[0][1], src0[1][1], src0[2][1], src0[3][1], src0[0][2],
            src0[1][2], src0[2][2], src0[3][2], src0[0][3], src0[1][3],
            src0[2][3], src0[3][3], src0[0][4], src0[1][4], src0[2][4],
            src0[3][4], src0[0][5], src0[1][5], src0[2][5], src0[3][5],
            src0[0][6], src0[1][6], src0[2][6], src0[3][6], src0[0][7],
            src0[1][7], src0[2][7], src0[3][7], 0);
  // Rows
  // transpose the first row of 8x8 blocks
  TRANSPOSE_8x32(src0, tmp);
  // transform the 32x8 column
  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src0);
  TRANSPOSE_8x32(tmp, src0);

  // Second band. Offset step 512 appears to be 8 rows * 32 coeffs *
  // sizeof(int16_t) bytes — presumed byte units; confirm against
  // load_tran_low in bitdepth_conversion_vsx.h.
  LOAD_8x32(load_tran_low, src1[0][0], src1[1][0], src1[2][0], src1[3][0],
            src1[0][1], src1[1][1], src1[2][1], src1[3][1], src1[0][2],
            src1[1][2], src1[2][2], src1[3][2], src1[0][3], src1[1][3],
            src1[2][3], src1[3][3], src1[0][4], src1[1][4], src1[2][4],
            src1[3][4], src1[0][5], src1[1][5], src1[2][5], src1[3][5],
            src1[0][6], src1[1][6], src1[2][6], src1[3][6], src1[0][7],
            src1[1][7], src1[2][7], src1[3][7], 512);
  TRANSPOSE_8x32(src1, tmp);
  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src1);
  TRANSPOSE_8x32(tmp, src1);

  // Third band.
  LOAD_8x32(load_tran_low, src2[0][0], src2[1][0], src2[2][0], src2[3][0],
            src2[0][1], src2[1][1], src2[2][1], src2[3][1], src2[0][2],
            src2[1][2], src2[2][2], src2[3][2], src2[0][3], src2[1][3],
            src2[2][3], src2[3][3], src2[0][4], src2[1][4], src2[2][4],
            src2[3][4], src2[0][5], src2[1][5], src2[2][5], src2[3][5],
            src2[0][6], src2[1][6], src2[2][6], src2[3][6], src2[0][7],
            src2[1][7], src2[2][7], src2[3][7], 1024);
  TRANSPOSE_8x32(src2, tmp);
  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src2);
  TRANSPOSE_8x32(tmp, src2);

  // Fourth band.
  LOAD_8x32(load_tran_low, src3[0][0], src3[1][0], src3[2][0], src3[3][0],
            src3[0][1], src3[1][1], src3[2][1], src3[3][1], src3[0][2],
            src3[1][2], src3[2][2], src3[3][2], src3[0][3], src3[1][3],
            src3[2][3], src3[3][3], src3[0][4], src3[1][4], src3[2][4],
            src3[3][4], src3[0][5], src3[1][5], src3[2][5], src3[3][5],
            src3[0][6], src3[1][6], src3[2][6], src3[3][6], src3[0][7],
            src3[1][7], src3[2][7], src3[3][7], 1536);
  TRANSPOSE_8x32(src3, tmp);
  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src3);
  TRANSPOSE_8x32(tmp, src3);

  // Columns
  // IDCT32 writes its final results back into its first four arguments
  // (srcN[i]), using tmp as scratch, so srcN hold the finished pixels.
  IDCT32(src0[0], src1[0], src2[0], src3[0], tmp);
  IDCT32(src0[1], src1[1], src2[1], src3[1], tmp);
  IDCT32(src0[2], src1[2], src2[2], src3[2], tmp);
  IDCT32(src0[3], src1[3], src2[3], src3[3], tmp);

  // Round, shift by 6, add to dest, and clamp — 8 rows per band.
  ADD_STORE_BLOCK(src0, 0);
  ADD_STORE_BLOCK(src1, 8);
  ADD_STORE_BLOCK(src2, 16);
  ADD_STORE_BLOCK(src3, 24);
}
   1064