1 /* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <math.h> 12 #include <stdlib.h> 13 #include <string.h> 14 15 #include "vpx_dsp/ppc/bitdepth_conversion_vsx.h" 16 #include "vpx_dsp/ppc/types_vsx.h" 17 18 #include "./vpx_dsp_rtcd.h" 19 #include "vpx_dsp/inv_txfm.h" 20 21 static int16x8_t cospi1_v = { 16364, 16364, 16364, 16364, 22 16364, 16364, 16364, 16364 }; 23 static int16x8_t cospi2_v = { 16305, 16305, 16305, 16305, 24 16305, 16305, 16305, 16305 }; 25 static int16x8_t cospi3_v = { 16207, 16207, 16207, 16207, 26 16207, 16207, 16207, 16207 }; 27 static int16x8_t cospi4_v = { 16069, 16069, 16069, 16069, 28 16069, 16069, 16069, 16069 }; 29 static int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069, 30 -16069, -16069, -16069, -16069 }; 31 static int16x8_t cospi5_v = { 15893, 15893, 15893, 15893, 32 15893, 15893, 15893, 15893 }; 33 static int16x8_t cospi6_v = { 15679, 15679, 15679, 15679, 34 15679, 15679, 15679, 15679 }; 35 static int16x8_t cospi7_v = { 15426, 15426, 15426, 15426, 36 15426, 15426, 15426, 15426 }; 37 static int16x8_t cospi8_v = { 15137, 15137, 15137, 15137, 38 15137, 15137, 15137, 15137 }; 39 static int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137, 40 -15137, -15137, -15137, -15137 }; 41 static int16x8_t cospi9_v = { 14811, 14811, 14811, 14811, 42 14811, 14811, 14811, 14811 }; 43 static int16x8_t cospi10_v = { 14449, 14449, 14449, 14449, 44 14449, 14449, 14449, 14449 }; 45 static int16x8_t cospi11_v = { 14053, 14053, 14053, 14053, 46 14053, 14053, 14053, 14053 }; 47 static int16x8_t cospi12_v = { 13623, 13623, 13623, 13623, 48 13623, 13623, 13623, 13623 }; 49 static 
int16x8_t cospi13_v = { 13160, 13160, 13160, 13160, 50 13160, 13160, 13160, 13160 }; 51 static int16x8_t cospi14_v = { 12665, 12665, 12665, 12665, 52 12665, 12665, 12665, 12665 }; 53 static int16x8_t cospi15_v = { 12140, 12140, 12140, 12140, 54 12140, 12140, 12140, 12140 }; 55 static int16x8_t cospi16_v = { 11585, 11585, 11585, 11585, 56 11585, 11585, 11585, 11585 }; 57 static int16x8_t cospi17_v = { 11003, 11003, 11003, 11003, 58 11003, 11003, 11003, 11003 }; 59 static int16x8_t cospi18_v = { 10394, 10394, 10394, 10394, 60 10394, 10394, 10394, 10394 }; 61 static int16x8_t cospi19_v = { 9760, 9760, 9760, 9760, 9760, 9760, 9760, 9760 }; 62 static int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, 9102, 9102, 9102, 9102 }; 63 static int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102, 64 -9102, -9102, -9102, -9102 }; 65 static int16x8_t cospi21_v = { 8423, 8423, 8423, 8423, 8423, 8423, 8423, 8423 }; 66 static int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, 7723, 7723, 7723, 7723 }; 67 static int16x8_t cospi23_v = { 7005, 7005, 7005, 7005, 7005, 7005, 7005, 7005 }; 68 static int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, 6270, 6270, 6270, 6270 }; 69 static int16x8_t cospi24_mv = { -6270, -6270, -6270, -6270, 70 -6270, -6270, -6270, -6270 }; 71 static int16x8_t cospi25_v = { 5520, 5520, 5520, 5520, 5520, 5520, 5520, 5520 }; 72 static int16x8_t cospi26_v = { 4756, 4756, 4756, 4756, 4756, 4756, 4756, 4756 }; 73 static int16x8_t cospi27_v = { 3981, 3981, 3981, 3981, 3981, 3981, 3981, 3981 }; 74 static int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, 3196, 3196, 3196, 3196 }; 75 static int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, 2404, 2404, 2404, 2404 }; 76 static int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, 1606, 1606, 1606, 1606 }; 77 static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; 78 79 #define ROUND_SHIFT_INIT \ 80 const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13)); \ 81 const uint32x4_t shift14 = vec_splat_u32(14); 82 83 
/* Vector dct_const_round_shift(): x = (x + (1 << 13)) >> 14.
 * Requires `shift` and `shift14` from ROUND_SHIFT_INIT to be in scope. */
#define DCT_CONST_ROUND_SHIFT(vec) vec = vec_sra(vec_add(vec, shift), shift14);

/* Rounding constants for the 4x4 final stage: PIXEL_ADD4 computes
 * (x + 8) >> 4, i.e. ROUND_POWER_OF_TWO(x, 4). */
#define PIXEL_ADD_INIT               \
  int16x8_t add8 = vec_splat_s16(8); \
  uint16x8_t shift4 = vec_splat_u16(4);

#define PIXEL_ADD4(out, in) out = vec_sra(vec_add(in, add8), shift4);

/* One 4-point IDCT pass over two vectors that each hold two 4-wide rows.
 * Even/odd 16x16->32-bit multiplies (vec_mule/vec_mulo) against the
 * cospi vectors implement the butterfly; results are rounded back to
 * 16 bits with vec_packs.  out1's halves are swapped via mask0 so the
 * caller's merge steps produce the rows in natural order.
 * Uses caller-scope temporaries t0, t1, tmp16_0, temp1..temp4,
 * step0/step1, mask0, and the ROUND_SHIFT_INIT constants. */
#define IDCT4(in0, in1, out0, out1)                                           \
  t0 = vec_add(in0, in1);                                                     \
  t1 = vec_sub(in0, in1);                                                     \
  tmp16_0 = vec_mergeh(t0, t1);                                               \
  temp1 = vec_sra(vec_add(vec_mule(tmp16_0, cospi16_v), shift), shift14);     \
  temp2 = vec_sra(vec_add(vec_mulo(tmp16_0, cospi16_v), shift), shift14);     \
                                                                              \
  tmp16_0 = vec_mergel(in0, in1);                                             \
  temp3 = vec_sub(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \
  DCT_CONST_ROUND_SHIFT(temp3);                                               \
  temp4 = vec_add(vec_mule(tmp16_0, cospi8_v), vec_mulo(tmp16_0, cospi24_v)); \
  DCT_CONST_ROUND_SHIFT(temp4);                                               \
                                                                              \
  step0 = vec_packs(temp1, temp2);                                            \
  step1 = vec_packs(temp4, temp3);                                            \
  out0 = vec_add(step0, step1);                                               \
  out1 = vec_sub(step0, step1);                                               \
  out1 = vec_perm(out1, out1, mask0);

/* 4x4 inverse DCT (all 16 coefficients) plus reconstruction: runs IDCT4
 * on the rows, transposes, runs IDCT4 on the columns, rounds each result
 * with (x + 8) >> 4, adds it to the predictor pixels loaded from `dest`,
 * and stores the saturated 8-bit result back one 4-pixel row at a time. */
void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  int32x4_t temp1, temp2, temp3, temp4;
  int16x8_t step0, step1, tmp16_0, tmp16_1, t_out0, t_out1;
  /* mask0 swaps the two 64-bit halves of a vector (used by IDCT4). */
  uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF,
                       0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 };
  /* mask1 concatenates the low halves of two vectors. */
  uint8x16_t mask1 = { 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
                       0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 };
  int16x8_t v0 = load_tran_low(0, input);
  int16x8_t v1 = load_tran_low(8 * sizeof(*input), input);
  int16x8_t t0 = vec_mergeh(v0, v1);
  int16x8_t t1 = vec_mergel(v0, v1);

  uint8x16_t dest0 = vec_vsx_ld(0, dest);
  uint8x16_t dest1 = vec_vsx_ld(stride, dest);
  uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest);
  uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest);
  uint8x16_t zerov = vec_splat_u8(0);
  /* Zero-extend the destination pixels to 16 bits for the add. */
  int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov);
  int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov);
  int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov);
  int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov);
  uint8x16_t output_v;
  uint8_t tmp_dest[16];
  ROUND_SHIFT_INIT
  PIXEL_ADD_INIT;

  v0 = vec_mergeh(t0, t1);
  v1 = vec_mergel(t0, t1);

  IDCT4(v0, v1, t_out0, t_out1);
  // transpose
  t0 = vec_mergeh(t_out0, t_out1);
  t1 = vec_mergel(t_out0, t_out1);
  v0 = vec_mergeh(t0, t1);
  v1 = vec_mergel(t0, t1);
  IDCT4(v0, v1, t_out0, t_out1);

  PIXEL_ADD4(v0, t_out0);
  PIXEL_ADD4(v1, t_out1);
  tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0);
  tmp16_1 = vec_add(vec_perm(d_u2, d_u3, mask1), v1);
  output_v = vec_packsu(tmp16_0, tmp16_1);

  /* Scatter the packed 16-byte result to four 4-byte rows of dest.
   * Staged through tmp_dest because each output row is only 4 bytes. */
  vec_vsx_st(output_v, 0, tmp_dest);
  for (int i = 0; i < 4; i++)
    for (int j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i];
}

/* Transposes an 8x8 int16 matrix held in eight vectors: two rounds of
 * 16-bit/32-bit merges followed by vec_perm with tr8_mask0/tr8_mask1 to
 * recombine the 64-bit halves.  Clobbers in0..in7. */
#define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
                     out3, out4, out5, out6, out7)                             \
  out0 = vec_mergeh(in0, in1);                                                 \
  out1 = vec_mergel(in0, in1);                                                 \
  out2 = vec_mergeh(in2, in3);                                                 \
  out3 = vec_mergel(in2, in3);                                                 \
  out4 = vec_mergeh(in4, in5);                                                 \
  out5 = vec_mergel(in4, in5);                                                 \
  out6 = vec_mergeh(in6, in7);                                                 \
  out7 = vec_mergel(in6, in7);                                                 \
  in0 = (int16x8_t)vec_mergeh((int32x4_t)out0, (int32x4_t)out2);               \
  in1 = (int16x8_t)vec_mergel((int32x4_t)out0, (int32x4_t)out2);               \
  in2 = (int16x8_t)vec_mergeh((int32x4_t)out1, (int32x4_t)out3);               \
  in3 = (int16x8_t)vec_mergel((int32x4_t)out1, (int32x4_t)out3);               \
  in4 = (int16x8_t)vec_mergeh((int32x4_t)out4, (int32x4_t)out6);               \
  in5 = (int16x8_t)vec_mergel((int32x4_t)out4, (int32x4_t)out6);               \
  in6 = (int16x8_t)vec_mergeh((int32x4_t)out5, (int32x4_t)out7);               \
  in7 = (int16x8_t)vec_mergel((int32x4_t)out5, (int32x4_t)out7);               \
  out0 = vec_perm(in0, in4, tr8_mask0);                                        \
  out1 = vec_perm(in0, in4, tr8_mask1);                                        \
  out2 = vec_perm(in1, in5, tr8_mask0);                                        \
  out3 = vec_perm(in1, in5, tr8_mask1);                                        \
  out4 = vec_perm(in2, in6, tr8_mask0);                                        \
  out5 = vec_perm(in2, in6, tr8_mask1);                                        \
  out6 = vec_perm(in3, in7, tr8_mask0);                                        \
  out7 = vec_perm(in3, in7, tr8_mask1);

/* for the: temp1 = step[x] * cospi_q - step[y] * cospi_z
 *          temp2 = step[x] * cospi_z + step[y] * cospi_q
 * Inputs are interleaved (mergeh/mergel) so vec_mule hits step[x] lanes
 * and vec_mulo hits step[y] lanes; each 32-bit result is rounded back to
 * 16 bits with DCT_CONST_ROUND_SHIFT + vec_packs.
 * Clobbers caller-scope tmp16_0/tmp16_1, temp10/temp11. */
#define STEP8_0(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1)             \
  tmp16_0 = vec_mergeh(inpt0, inpt1);                                     \
  tmp16_1 = vec_mergel(inpt0, inpt1);                                     \
  temp10 = vec_sub(vec_mule(tmp16_0, cospi0), vec_mulo(tmp16_0, cospi1)); \
  temp11 = vec_sub(vec_mule(tmp16_1, cospi0), vec_mulo(tmp16_1, cospi1)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                          \
  DCT_CONST_ROUND_SHIFT(temp11);                                          \
  outpt0 = vec_packs(temp10, temp11);                                     \
  temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \
  temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                          \
  DCT_CONST_ROUND_SHIFT(temp11);                                          \
  outpt1 = vec_packs(temp10, temp11);

/* outpt0 = (inpt0 - inpt1) * cospi, outpt1 = (inpt0 + inpt1) * cospi,
 * both dct_const_round_shifted.  The difference/sum pair is interleaved
 * before multiplying so even products feed outpt0 and odd products feed
 * outpt1.  Clobbers tmp16_0..tmp16_3, temp10/temp11. */
#define STEP8_1(inpt0, inpt1, outpt0, outpt1, cospi) \
  tmp16_2 = vec_sub(inpt0, inpt1);                   \
  tmp16_3 = vec_add(inpt0, inpt1);                   \
  tmp16_0 = vec_mergeh(tmp16_2, tmp16_3);            \
  tmp16_1 = vec_mergel(tmp16_2, tmp16_3);            \
  temp10 = vec_mule(tmp16_0, cospi);                 \
  temp11 = vec_mule(tmp16_1, cospi);                 \
  DCT_CONST_ROUND_SHIFT(temp10);                     \
  DCT_CONST_ROUND_SHIFT(temp11);                     \
  outpt0 = vec_packs(temp10, temp11);                \
  temp10 = vec_mulo(tmp16_0, cospi);                 \
  temp11 = vec_mulo(tmp16_1, cospi);                 \
  DCT_CONST_ROUND_SHIFT(temp10);                     \
  DCT_CONST_ROUND_SHIFT(temp11);                     \
  outpt1 = vec_packs(temp10, temp11);

/* 8-point inverse DCT butterfly (stages 1-4), one column per lane,
 * operating in place on in0..in7.  Mirrors the scalar idct8_c() stage
 * structure.  Clobbers step0..step7 and the STEP8_* temporaries. */
#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7)    \
  /* stage 1 */                                          \
  step0 = in0;                                           \
  step2 = in4;                                           \
  step1 = in2;                                           \
  step3 = in6;                                           \
                                                         \
  STEP8_0(in1, in7, step4, step7, cospi28_v, cospi4_v);  \
  STEP8_0(in5, in3, step5, step6, cospi12_v, cospi20_v); \
                                                         \
  /* stage 2 */                                          \
  STEP8_1(step0, step2, in1, in0, cospi16_v);            \
  STEP8_0(step1, step3, in2, in3, cospi24_v, cospi8_v);  \
  in4 = vec_add(step4, step5);                           \
  in5 = vec_sub(step4, step5);                           \
  in6 = vec_sub(step7, step6);                           \
  in7 = vec_add(step6, step7);                           \
                                                         \
  /* stage 3 */                                          \
  step0 = vec_add(in0, in3);                             \
  step1 = vec_add(in1, in2);                             \
  step2 = vec_sub(in1, in2);                             \
  step3 = vec_sub(in0, in3);                             \
  step4 = in4;                                           \
  STEP8_1(in6, in5, step5, step6, cospi16_v);            \
  step7 = in7;                                           \
                                                         \
  /* stage 4 */                                          \
  in0 = vec_add(step0, step7);                           \
  in1 = vec_add(step1, step6);                           \
  in2 = vec_add(step2, step5);                           \
  in3 = vec_add(step3, step4);                           \
  in4 = vec_sub(step3, step4);                           \
  in5 = vec_sub(step2, step5);                           \
  in6 = vec_sub(step1, step6);                           \
  in7 = vec_sub(step0, step7);

/* out += (in + add) >> shiftx: round the transform output and add it to
 * the (zero-extended) predictor pixels held in `out`. */
#define PIXEL_ADD(in, out, add, shiftx) \
  out = vec_add(vec_sra(vec_add(in, add), shiftx), out);

/* vec_perm selectors used by TRANSPOSE8x8: tr8_mask0 takes the low
 * 64-bit half of each operand, tr8_mask1 the high half. */
static uint8x16_t tr8_mask0 = {
  0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
};
static uint8x16_t tr8_mask1 = {
  0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF,
  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F
};

/* 8x8 inverse DCT (all 64 coefficients) plus reconstruction: transpose,
 * IDCT8 rows, transpose, IDCT8 columns, then round each result with
 * (x + 16) >> 5 (add = 8 << 1) and add it to the predictor pixels.
 * Each 16-byte store merges the 8 output bytes with the 8 untouched
 * bytes previously loaded from that dest row (xxpermdi), so memory
 * beyond the 8x8 block is rewritten with its original contents. */
void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  int32x4_t temp10, temp11;
  int16x8_t step0, step1, step2, step3, step4, step5, step6, step7;
  int16x8_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp16_0, tmp16_1,
      tmp16_2, tmp16_3;
  int16x8_t src0 = load_tran_low(0, input);
  int16x8_t src1 = load_tran_low(8 * sizeof(*input), input);
  int16x8_t src2 = load_tran_low(16 * sizeof(*input), input);
  int16x8_t src3 = load_tran_low(24 * sizeof(*input), input);
  int16x8_t src4 = load_tran_low(32 * sizeof(*input), input);
  int16x8_t src5 = load_tran_low(40 * sizeof(*input), input);
  int16x8_t src6 = load_tran_low(48 * sizeof(*input), input);
  int16x8_t src7 = load_tran_low(56 * sizeof(*input), input);
  uint8x16_t dest0 = vec_vsx_ld(0, dest);
  uint8x16_t dest1 = vec_vsx_ld(stride, dest);
  uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest);
  uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest);
  uint8x16_t dest4 = vec_vsx_ld(4 * stride, dest);
  uint8x16_t dest5 = vec_vsx_ld(5 * stride, dest);
  uint8x16_t dest6 = vec_vsx_ld(6 * stride, dest);
  uint8x16_t dest7 = vec_vsx_ld(7 * stride, dest);
  uint8x16_t zerov = vec_splat_u8(0);
  /* Zero-extend the 8 predictor pixels of each row to 16 bits. */
  int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov);
  int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov);
  int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov);
  int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov);
  int16x8_t d_u4 = (int16x8_t)vec_mergeh(dest4, zerov);
  int16x8_t d_u5 = (int16x8_t)vec_mergeh(dest5, zerov);
  int16x8_t d_u6 = (int16x8_t)vec_mergeh(dest6, zerov);
  int16x8_t d_u7 = (int16x8_t)vec_mergeh(dest7, zerov);
  /* add = 16, shift5 = 5: final rounding is (x + 16) >> 5. */
  int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(1));
  uint16x8_t shift5 = vec_splat_u16(5);
  uint8x16_t output0, output1, output2, output3;
  ROUND_SHIFT_INIT;

  TRANSPOSE8x8(src0, src1, src2, src3, src4, src5, src6, src7, tmp0, tmp1, tmp2,
               tmp3, tmp4, tmp5, tmp6, tmp7);

  IDCT8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  TRANSPOSE8x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, src2,
               src3, src4, src5, src6, src7);
  IDCT8(src0, src1, src2, src3, src4, src5, src6, src7);
  PIXEL_ADD(src0, d_u0, add, shift5);
  PIXEL_ADD(src1, d_u1, add, shift5);
  PIXEL_ADD(src2, d_u2, add, shift5);
  PIXEL_ADD(src3, d_u3, add, shift5);
  PIXEL_ADD(src4, d_u4, add, shift5);
  PIXEL_ADD(src5, d_u5, add, shift5);
  PIXEL_ADD(src6, d_u6, add, shift5);
  PIXEL_ADD(src7, d_u7, add, shift5);
  output0 = vec_packsu(d_u0, d_u1);
  output1 = vec_packsu(d_u2, d_u3);
  output2 = vec_packsu(d_u4, d_u5);
  output3 = vec_packsu(d_u6, d_u7);

  /* Store 8 result bytes per row, keeping the other half of each
   * 16-byte vector equal to the originally loaded dest bytes. */
  vec_vsx_st(xxpermdi(output0, dest0, 1), 0, dest);
  vec_vsx_st(xxpermdi(output0, dest1, 3), stride, dest);
  vec_vsx_st(xxpermdi(output1, dest2, 1), 2 * stride, dest);
  vec_vsx_st(xxpermdi(output1, dest3, 3), 3 * stride, dest);
  vec_vsx_st(xxpermdi(output2, dest4, 1), 4 * stride, dest);
  vec_vsx_st(xxpermdi(output2, dest5, 3), 5 * stride, dest);
  vec_vsx_st(xxpermdi(output3, dest6, 1), 6 * stride, dest);
  vec_vsx_st(xxpermdi(output3, dest7, 3), 7 * stride, dest);
}

/* Loads 16 vectors from `source` at `step`-byte intervals starting at
 * `offset`, using the given `load` primitive (load_tran_low or
 * vec_vsx_ld). */
#define LOAD_INPUT16(load, source, offset, step, in0, in1, in2, in3, in4, in5, \
                     in6, in7, in8, in9, inA, inB, inC, inD, inE, inF)         \
  in0 = load(offset, source);                                                  \
  in1 = load((step) + (offset), source);                                       \
  in2 = load(2 * (step) + (offset), source);                                   \
  in3 = load(3 * (step) + (offset), source);                                   \
  in4 = load(4 * (step) + (offset), source);                                   \
  in5 = load(5 * (step) + (offset), source);                                   \
  in6 = load(6 * (step) + (offset), source);                                   \
  in7 = load(7 * (step) + (offset), source);                                   \
  in8 = load(8 * (step) + (offset), source);                                   \
  in9 = load(9 * (step) + (offset), source);                                   \
  inA = load(10 * (step) + (offset), source);                                  \
  inB = load(11 * (step) + (offset), source);                                  \
  inC = load(12 * (step) + (offset), source);                                  \
  inD = load(13 * (step) + (offset), source);                                  \
  inE = load(14 * (step) + (offset), source);                                  \
  inF = load(15 * (step) + (offset), source);

/* outpt0 = (inpt0 - inpt1) * cospi, outpt1 = (inpt0 + inpt1) * cospi,
 * both dct_const_round_shifted.  Same contract as STEP8_1, but the
 * difference/sum is formed on the 32-bit products (even minus/plus odd)
 * after interleaving, which needs the extra temporaries temp20/21/30. */
#define STEP16_1(inpt0, inpt1, outpt0, outpt1, cospi) \
  tmp16_0 = vec_mergeh(inpt0, inpt1);                 \
  tmp16_1 = vec_mergel(inpt0, inpt1);                 \
  temp10 = vec_mule(tmp16_0, cospi);                  \
  temp11 = vec_mule(tmp16_1, cospi);                  \
  temp20 = vec_mulo(tmp16_0, cospi);                  \
  temp21 = vec_mulo(tmp16_1, cospi);                  \
  temp30 = vec_sub(temp10, temp20);                   \
  temp10 = vec_add(temp10, temp20);                   \
  temp20 = vec_sub(temp11, temp21);                   \
  temp21 = vec_add(temp11, temp21);                   \
  DCT_CONST_ROUND_SHIFT(temp30);                      \
  DCT_CONST_ROUND_SHIFT(temp20);                      \
  outpt0 = vec_packs(temp30, temp20);                 \
  DCT_CONST_ROUND_SHIFT(temp10);                      \
  DCT_CONST_ROUND_SHIFT(temp21);                      \
  outpt1 = vec_packs(temp10, temp21);
/* 16-point inverse DCT butterfly (stages 1-7), one column per lane.
 * Mirrors the scalar idct16_c() stage structure; the result lands in
 * in0..inF/out0..outF per the final stage-7 adds/subs.  The in0 term is
 * passed through stages 1-3 untouched (see the commented-out copies).
 * Clobbers the STEP8_0/STEP16_1 temporaries tmp16_0/1 and temp10..30. */
#define IDCT16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, inA, inB,    \
               inC, inD, inE, inF, out0, out1, out2, out3, out4, out5, out6,  \
               out7, out8, out9, outA, outB, outC, outD, outE, outF)          \
  /* stage 1 */                                                               \
  /* out0 = in0; */                                                           \
  out1 = in8;                                                                 \
  out2 = in4;                                                                 \
  out3 = inC;                                                                 \
  out4 = in2;                                                                 \
  out5 = inA;                                                                 \
  out6 = in6;                                                                 \
  out7 = inE;                                                                 \
  out8 = in1;                                                                 \
  out9 = in9;                                                                 \
  outA = in5;                                                                 \
  outB = inD;                                                                 \
  outC = in3;                                                                 \
  outD = inB;                                                                 \
  outE = in7;                                                                 \
  outF = inF;                                                                 \
                                                                              \
  /* stage 2 */                                                               \
  /* in0 = out0; */                                                           \
  in1 = out1;                                                                 \
  in2 = out2;                                                                 \
  in3 = out3;                                                                 \
  in4 = out4;                                                                 \
  in5 = out5;                                                                 \
  in6 = out6;                                                                 \
  in7 = out7;                                                                 \
                                                                              \
  STEP8_0(out8, outF, in8, inF, cospi30_v, cospi2_v);                         \
  STEP8_0(out9, outE, in9, inE, cospi14_v, cospi18_v);                        \
  STEP8_0(outA, outD, inA, inD, cospi22_v, cospi10_v);                        \
  STEP8_0(outB, outC, inB, inC, cospi6_v, cospi26_v);                         \
                                                                              \
  /* stage 3 */                                                               \
  out0 = in0;                                                                 \
  out1 = in1;                                                                 \
  out2 = in2;                                                                 \
  out3 = in3;                                                                 \
                                                                              \
  STEP8_0(in4, in7, out4, out7, cospi28_v, cospi4_v);                         \
  STEP8_0(in5, in6, out5, out6, cospi12_v, cospi20_v);                        \
                                                                              \
  out8 = vec_add(in8, in9);                                                   \
  out9 = vec_sub(in8, in9);                                                   \
  outA = vec_sub(inB, inA);                                                   \
  outB = vec_add(inA, inB);                                                   \
  outC = vec_add(inC, inD);                                                   \
  outD = vec_sub(inC, inD);                                                   \
  outE = vec_sub(inF, inE);                                                   \
  outF = vec_add(inE, inF);                                                   \
                                                                              \
  /* stage 4 */                                                               \
  STEP16_1(out0, out1, in1, in0, cospi16_v);                                  \
  STEP8_0(out2, out3, in2, in3, cospi24_v, cospi8_v);                         \
  in4 = vec_add(out4, out5);                                                  \
  in5 = vec_sub(out4, out5);                                                  \
  in6 = vec_sub(out7, out6);                                                  \
  in7 = vec_add(out6, out7);                                                  \
                                                                              \
  in8 = out8;                                                                 \
  inF = outF;                                                                 \
  /* in9 = -out9 * cospi8 + outE * cospi24, inE = out9 * cospi24 +            \
   * outE * cospi8 (inlined because STEP8_0's sign pattern differs). */       \
  tmp16_0 = vec_mergeh(out9, outE);                                           \
  tmp16_1 = vec_mergel(out9, outE);                                           \
  temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \
  temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                              \
  DCT_CONST_ROUND_SHIFT(temp11);                                              \
  in9 = vec_packs(temp10, temp11);                                            \
  temp10 = vec_add(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \
  temp11 = vec_add(vec_mule(tmp16_1, cospi24_v), vec_mulo(tmp16_1, cospi8_v)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                              \
  DCT_CONST_ROUND_SHIFT(temp11);                                              \
  inE = vec_packs(temp10, temp11);                                            \
                                                                              \
  /* Same butterfly on outA/outD with -cospi24 (cospi24_mv). */               \
  tmp16_0 = vec_mergeh(outA, outD);                                           \
  tmp16_1 = vec_mergel(outA, outD);                                           \
  temp10 =                                                                    \
      vec_sub(vec_mule(tmp16_0, cospi24_mv), vec_mulo(tmp16_0, cospi8_v));    \
  temp11 =                                                                    \
      vec_sub(vec_mule(tmp16_1, cospi24_mv), vec_mulo(tmp16_1, cospi8_v));    \
  DCT_CONST_ROUND_SHIFT(temp10);                                              \
  DCT_CONST_ROUND_SHIFT(temp11);                                              \
  inA = vec_packs(temp10, temp11);                                            \
  temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \
  temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                              \
  DCT_CONST_ROUND_SHIFT(temp11);                                              \
  inD = vec_packs(temp10, temp11);                                            \
                                                                              \
  inB = outB;                                                                 \
  inC = outC;                                                                 \
                                                                              \
  /* stage 5 */                                                               \
  out0 = vec_add(in0, in3);                                                   \
  out1 = vec_add(in1, in2);                                                   \
  out2 = vec_sub(in1, in2);                                                   \
  out3 = vec_sub(in0, in3);                                                   \
  out4 = in4;                                                                 \
  STEP16_1(in6, in5, out5, out6, cospi16_v);                                  \
  out7 = in7;                                                                 \
                                                                              \
  out8 = vec_add(in8, inB);                                                   \
  out9 = vec_add(in9, inA);                                                   \
  outA = vec_sub(in9, inA);                                                   \
  outB = vec_sub(in8, inB);                                                   \
  outC = vec_sub(inF, inC);                                                   \
  outD = vec_sub(inE, inD);                                                   \
  outE = vec_add(inD, inE);                                                   \
  outF = vec_add(inC, inF);                                                   \
                                                                              \
  /* stage 6 */                                                               \
  in0 = vec_add(out0, out7);                                                  \
  in1 = vec_add(out1, out6);                                                  \
  in2 = vec_add(out2, out5);                                                  \
  in3 = vec_add(out3, out4);                                                  \
  in4 = vec_sub(out3, out4);                                                  \
  in5 = vec_sub(out2, out5);                                                  \
  in6 = vec_sub(out1, out6);                                                  \
  in7 = vec_sub(out0, out7);                                                  \
  in8 = out8;                                                                 \
  in9 = out9;                                                                 \
  STEP16_1(outD, outA, inA, inD, cospi16_v);                                  \
  STEP16_1(outC, outB, inB, inC, cospi16_v);                                  \
  inE = outE;                                                                 \
  inF = outF;                                                                 \
                                                                              \
  /* stage 7 */                                                               \
  out0 = vec_add(in0, inF);                                                   \
  out1 = vec_add(in1, inE);                                                   \
  out2 = vec_add(in2, inD);                                                   \
  out3 = vec_add(in3, inC);                                                   \
  out4 = vec_add(in4, inB);                                                   \
  out5 = vec_add(in5, inA);                                                   \
  out6 = vec_add(in6, in9);                                                   \
  out7 = vec_add(in7, in8);                                                   \
  out8 = vec_sub(in7, in8);                                                   \
  out9 = vec_sub(in6, in9);                                                   \
  outA = vec_sub(in5, inA);                                                   \
  outB = vec_sub(in4, inB);                                                   \
  outC = vec_sub(in3, inC);                                                   \
  outD = vec_sub(in2, inD);                                                   \
  outE = vec_sub(in1, inE);                                                   \
  outF = vec_sub(in0, inF);

/* Reconstructs one full 16-pixel row: zero-extend the 16 dest bytes,
 * add the rounded transform outputs for the left (in0) and right (in1)
 * halves via PIXEL_ADD, and store the saturated 16-byte result.
 * Uses caller-scope d_uh, d_ul, zerov, add, shift6 and `dest`. */
#define PIXEL_ADD_STORE16(in0, in1, dst, offset) \
  d_uh = (int16x8_t)vec_mergeh(dst, zerov);      \
  d_ul = (int16x8_t)vec_mergel(dst, zerov);      \
  PIXEL_ADD(in0, d_uh, add, shift6);             \
  PIXEL_ADD(in1, d_ul, add, shift6);             \
  vec_vsx_st(vec_packsu(d_uh, d_ul), offset, dest);

/* 16x16 inverse DCT (all 256 coefficients) plus reconstruction.
 * The 16x16 matrix is processed as four 8x8 quadrants: each half is
 * loaded with rows interleaved (src0x = left 8 lanes, src1x/src3x =
 * right 8 lanes), transposed per quadrant, passed through IDCT16 for
 * the row transform, transposed back, then IDCT16 again for the column
 * transform.  Final rounding is (x + 32) >> 6 (add = 8 << 2) before
 * adding to the predictor pixels from dest. */
void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  int32x4_t temp10, temp11, temp20, temp21, temp30;
  int16x8_t src00, src01, src02, src03, src04, src05, src06, src07, src10,
      src11, src12, src13, src14, src15, src16, src17;
  int16x8_t src20, src21, src22, src23, src24, src25, src26, src27, src30,
      src31, src32, src33, src34, src35, src36, src37;
  int16x8_t tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10,
      tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp16_0, tmp16_1;
  int16x8_t tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30,
      tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37;
  uint8x16_t dest0, dest1, dest2, dest3, dest4, dest5, dest6, dest7, dest8,
      dest9, destA, destB, destC, destD, destE, destF;
  int16x8_t d_uh, d_ul;
  /* add = 32, shift6 = 6: final rounding is (x + 32) >> 6. */
  int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2));
  uint16x8_t shift6 = vec_splat_u16(6);
  uint8x16_t zerov = vec_splat_u8(0);
  ROUND_SHIFT_INIT;

  // transform rows
  // load and transform the upper half of 16x16 matrix
  LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), src00, src10, src01,
               src11, src02, src12, src03, src13, src04, src14, src05, src15,
               src06, src16, src07, src17);
  TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00,
               tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07);
  TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10,
               tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17);
  IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10, tmp11,
         tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, src00, src01, src02, src03,
         src04, src05, src06, src07, src10, src11, src12, src13, src14, src15,
         src16, src17);
  TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00,
               tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07);
  TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10,
               tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17);

  // load and transform the lower half of 16x16 matrix
  LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input),
               8 * sizeof(*input), src20, src30, src21, src31, src22, src32,
               src23, src33, src24, src34, src25, src35, src26, src36, src27,
               src37);
  TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20,
               tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27);
  TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30,
               tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37);
  IDCT16(tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30, tmp31,
         tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src20, src21, src22, src23,
         src24, src25, src26, src27, src30, src31, src32, src33, src34, src35,
         src36, src37);
  TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20,
               tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27);
  TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30,
               tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37);

  // transform columns
  // left half first
  IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp20, tmp21,
         tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, src00, src01, src02, src03,
         src04, src05, src06, src07, src20, src21, src22, src23, src24, src25,
         src26, src27);
  // right half
  IDCT16(tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp30, tmp31,
         tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src10, src11, src12, src13,
         src14, src15, src16, src17, src30, src31, src32, src33, src34, src35,
         src36, src37);

  // load dest
  LOAD_INPUT16(vec_vsx_ld, dest, 0, stride, dest0, dest1, dest2, dest3, dest4,
               dest5, dest6, dest7, dest8, dest9, destA, destB, destC, destD,
               destE, destF);

  PIXEL_ADD_STORE16(src00, src10, dest0, 0);
  PIXEL_ADD_STORE16(src01, src11, dest1, stride);
  PIXEL_ADD_STORE16(src02, src12, dest2, 2 * stride);
  PIXEL_ADD_STORE16(src03, src13, dest3, 3 * stride);
  PIXEL_ADD_STORE16(src04, src14, dest4, 4 * stride);
  PIXEL_ADD_STORE16(src05, src15, dest5, 5 * stride);
  PIXEL_ADD_STORE16(src06, src16, dest6, 6 * stride);
  PIXEL_ADD_STORE16(src07, src17, dest7, 7 * stride);

  PIXEL_ADD_STORE16(src20, src30, dest8, 8 * stride);
  PIXEL_ADD_STORE16(src21, src31, dest9, 9 * stride);
  PIXEL_ADD_STORE16(src22, src32, destA, 10 * stride);
  PIXEL_ADD_STORE16(src23, src33, destB, 11 * stride);
  PIXEL_ADD_STORE16(src24, src34, destC, 12 * stride);
  PIXEL_ADD_STORE16(src25, src35, destD, 13 * stride);
  PIXEL_ADD_STORE16(src26, src36, destE, 14 * stride);
  PIXEL_ADD_STORE16(src27, src37, destF, 15 * stride);
}

/* Loads an 8x32 block of coefficients as 32 vectors (four per row of 32
 * int16 values), 16 bytes apart starting at `offset` into `input`. */
#define LOAD_8x32(load, in00, in01, in02, in03, in10, in11, in12, in13, in20, \
                  in21, in22, in23, in30, in31, in32, in33, in40, in41, in42, \
                  in43, in50, in51, in52, in53, in60, in61, in62, in63, in70, \
                  in71, in72, in73, offset)                                   \
  /* load the first row from the 8x32 block*/                                 \
  in00 =                                                                      \
load(offset, input); \ 620 in01 = load(offset + 16, input); \ 621 in02 = load(offset + 2 * 16, input); \ 622 in03 = load(offset + 3 * 16, input); \ 623 \ 624 in10 = load(offset + 4 * 16, input); \ 625 in11 = load(offset + 5 * 16, input); \ 626 in12 = load(offset + 6 * 16, input); \ 627 in13 = load(offset + 7 * 16, input); \ 628 \ 629 in20 = load(offset + 8 * 16, input); \ 630 in21 = load(offset + 9 * 16, input); \ 631 in22 = load(offset + 10 * 16, input); \ 632 in23 = load(offset + 11 * 16, input); \ 633 \ 634 in30 = load(offset + 12 * 16, input); \ 635 in31 = load(offset + 13 * 16, input); \ 636 in32 = load(offset + 14 * 16, input); \ 637 in33 = load(offset + 15 * 16, input); \ 638 \ 639 in40 = load(offset + 16 * 16, input); \ 640 in41 = load(offset + 17 * 16, input); \ 641 in42 = load(offset + 18 * 16, input); \ 642 in43 = load(offset + 19 * 16, input); \ 643 \ 644 in50 = load(offset + 20 * 16, input); \ 645 in51 = load(offset + 21 * 16, input); \ 646 in52 = load(offset + 22 * 16, input); \ 647 in53 = load(offset + 23 * 16, input); \ 648 \ 649 in60 = load(offset + 24 * 16, input); \ 650 in61 = load(offset + 25 * 16, input); \ 651 in62 = load(offset + 26 * 16, input); \ 652 in63 = load(offset + 27 * 16, input); \ 653 \ 654 /* load the last row from the 8x32 block*/ \ 655 in70 = load(offset + 28 * 16, input); \ 656 in71 = load(offset + 29 * 16, input); \ 657 in72 = load(offset + 30 * 16, input); \ 658 in73 = load(offset + 31 * 16, input); 659 660 /* for the: temp1 = -step[x] * cospi_q + step[y] * cospi_z 661 * temp2 = step[x] * cospi_z + step[y] * cospi_q */ 662 #define STEP32(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1) \ 663 tmp16_0 = vec_mergeh(inpt0, inpt1); \ 664 tmp16_1 = vec_mergel(inpt0, inpt1); \ 665 temp10 = vec_sub(vec_mulo(tmp16_0, cospi1), vec_mule(tmp16_0, cospi0)); \ 666 temp11 = vec_sub(vec_mulo(tmp16_1, cospi1), vec_mule(tmp16_1, cospi0)); \ 667 DCT_CONST_ROUND_SHIFT(temp10); \ 668 DCT_CONST_ROUND_SHIFT(temp11); \ 669 outpt0 = vec_packs(temp10, 
temp11); \ 670 temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \ 671 temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \ 672 DCT_CONST_ROUND_SHIFT(temp10); \ 673 DCT_CONST_ROUND_SHIFT(temp11); \ 674 outpt1 = vec_packs(temp10, temp11); 675 676 /* for the: temp1 = -step[x] * cospi_q - step[y] * cospi_z 677 * temp2 = -step[x] * cospi_z + step[y] * cospi_q */ 678 #define STEP32_1(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1, cospi1m) \ 679 tmp16_0 = vec_mergeh(inpt0, inpt1); \ 680 tmp16_1 = vec_mergel(inpt0, inpt1); \ 681 temp10 = vec_sub(vec_mulo(tmp16_0, cospi1m), vec_mule(tmp16_0, cospi0)); \ 682 temp11 = vec_sub(vec_mulo(tmp16_1, cospi1m), vec_mule(tmp16_1, cospi0)); \ 683 DCT_CONST_ROUND_SHIFT(temp10); \ 684 DCT_CONST_ROUND_SHIFT(temp11); \ 685 outpt0 = vec_packs(temp10, temp11); \ 686 temp10 = vec_sub(vec_mulo(tmp16_0, cospi0), vec_mule(tmp16_0, cospi1)); \ 687 temp11 = vec_sub(vec_mulo(tmp16_1, cospi0), vec_mule(tmp16_1, cospi1)); \ 688 DCT_CONST_ROUND_SHIFT(temp10); \ 689 DCT_CONST_ROUND_SHIFT(temp11); \ 690 outpt1 = vec_packs(temp10, temp11); 691 692 #define IDCT32(in0, in1, in2, in3, out) \ 693 \ 694 /* stage 1 */ \ 695 /* out[0][0] = in[0][0]; */ \ 696 out[0][1] = in2[0]; \ 697 out[0][2] = in1[0]; \ 698 out[0][3] = in3[0]; \ 699 out[0][4] = in0[4]; \ 700 out[0][5] = in2[4]; \ 701 out[0][6] = in1[4]; \ 702 out[0][7] = in3[4]; \ 703 out[1][0] = in0[2]; \ 704 out[1][1] = in2[2]; \ 705 out[1][2] = in1[2]; \ 706 out[1][3] = in3[2]; \ 707 out[1][4] = in0[6]; \ 708 out[1][5] = in2[6]; \ 709 out[1][6] = in1[6]; \ 710 out[1][7] = in3[6]; \ 711 \ 712 STEP8_0(in0[1], in3[7], out[2][0], out[3][7], cospi31_v, cospi1_v); \ 713 STEP8_0(in2[1], in1[7], out[2][1], out[3][6], cospi15_v, cospi17_v); \ 714 STEP8_0(in1[1], in2[7], out[2][2], out[3][5], cospi23_v, cospi9_v); \ 715 STEP8_0(in3[1], in0[7], out[2][3], out[3][4], cospi7_v, cospi25_v); \ 716 STEP8_0(in0[5], in3[3], out[2][4], out[3][3], cospi27_v, cospi5_v); \ 717 
STEP8_0(in2[5], in1[3], out[2][5], out[3][2], cospi11_v, cospi21_v); \ 718 STEP8_0(in1[5], in2[3], out[2][6], out[3][1], cospi19_v, cospi13_v); \ 719 STEP8_0(in3[5], in0[3], out[2][7], out[3][0], cospi3_v, cospi29_v); \ 720 \ 721 /* stage 2 */ \ 722 /* in0[0] = out[0][0]; */ \ 723 in0[1] = out[0][1]; \ 724 in0[2] = out[0][2]; \ 725 in0[3] = out[0][3]; \ 726 in0[4] = out[0][4]; \ 727 in0[5] = out[0][5]; \ 728 in0[6] = out[0][6]; \ 729 in0[7] = out[0][7]; \ 730 \ 731 STEP8_0(out[1][0], out[1][7], in1[0], in1[7], cospi30_v, cospi2_v); \ 732 STEP8_0(out[1][1], out[1][6], in1[1], in1[6], cospi14_v, cospi18_v); \ 733 STEP8_0(out[1][2], out[1][5], in1[2], in1[5], cospi22_v, cospi10_v); \ 734 STEP8_0(out[1][3], out[1][4], in1[3], in1[4], cospi6_v, cospi26_v); \ 735 \ 736 in2[0] = vec_add(out[2][0], out[2][1]); \ 737 in2[1] = vec_sub(out[2][0], out[2][1]); \ 738 in2[2] = vec_sub(out[2][3], out[2][2]); \ 739 in2[3] = vec_add(out[2][3], out[2][2]); \ 740 in2[4] = vec_add(out[2][4], out[2][5]); \ 741 in2[5] = vec_sub(out[2][4], out[2][5]); \ 742 in2[6] = vec_sub(out[2][7], out[2][6]); \ 743 in2[7] = vec_add(out[2][7], out[2][6]); \ 744 in3[0] = vec_add(out[3][0], out[3][1]); \ 745 in3[1] = vec_sub(out[3][0], out[3][1]); \ 746 in3[2] = vec_sub(out[3][3], out[3][2]); \ 747 in3[3] = vec_add(out[3][3], out[3][2]); \ 748 in3[4] = vec_add(out[3][4], out[3][5]); \ 749 in3[5] = vec_sub(out[3][4], out[3][5]); \ 750 in3[6] = vec_sub(out[3][7], out[3][6]); \ 751 in3[7] = vec_add(out[3][6], out[3][7]); \ 752 \ 753 /* stage 3 */ \ 754 out[0][0] = in0[0]; \ 755 out[0][1] = in0[1]; \ 756 out[0][2] = in0[2]; \ 757 out[0][3] = in0[3]; \ 758 \ 759 STEP8_0(in0[4], in0[7], out[0][4], out[0][7], cospi28_v, cospi4_v); \ 760 STEP8_0(in0[5], in0[6], out[0][5], out[0][6], cospi12_v, cospi20_v); \ 761 \ 762 out[1][0] = vec_add(in1[0], in1[1]); \ 763 out[1][1] = vec_sub(in1[0], in1[1]); \ 764 out[1][2] = vec_sub(in1[3], in1[2]); \ 765 out[1][3] = vec_add(in1[2], in1[3]); \ 766 out[1][4] = 
vec_add(in1[4], in1[5]); \ 767 out[1][5] = vec_sub(in1[4], in1[5]); \ 768 out[1][6] = vec_sub(in1[7], in1[6]); \ 769 out[1][7] = vec_add(in1[6], in1[7]); \ 770 \ 771 out[2][0] = in2[0]; \ 772 out[3][7] = in3[7]; \ 773 STEP32(in2[1], in3[6], out[2][1], out[3][6], cospi4_v, cospi28_v); \ 774 STEP32_1(in2[2], in3[5], out[2][2], out[3][5], cospi28_v, cospi4_v, \ 775 cospi4m_v); \ 776 out[2][3] = in2[3]; \ 777 out[2][4] = in2[4]; \ 778 STEP32(in2[5], in3[2], out[2][5], out[3][2], cospi20_v, cospi12_v); \ 779 STEP32_1(in2[6], in3[1], out[2][6], out[3][1], cospi12_v, cospi20_v, \ 780 cospi20m_v); \ 781 out[2][7] = in2[7]; \ 782 out[3][0] = in3[0]; \ 783 out[3][3] = in3[3]; \ 784 out[3][4] = in3[4]; \ 785 \ 786 /* stage 4 */ \ 787 STEP16_1(out[0][0], out[0][1], in0[1], in0[0], cospi16_v); \ 788 STEP8_0(out[0][2], out[0][3], in0[2], in0[3], cospi24_v, cospi8_v); \ 789 in0[4] = vec_add(out[0][4], out[0][5]); \ 790 in0[5] = vec_sub(out[0][4], out[0][5]); \ 791 in0[6] = vec_sub(out[0][7], out[0][6]); \ 792 in0[7] = vec_add(out[0][7], out[0][6]); \ 793 \ 794 in1[0] = out[1][0]; \ 795 in1[7] = out[1][7]; \ 796 STEP32(out[1][1], out[1][6], in1[1], in1[6], cospi8_v, cospi24_v); \ 797 STEP32_1(out[1][2], out[1][5], in1[2], in1[5], cospi24_v, cospi8_v, \ 798 cospi8m_v); \ 799 in1[3] = out[1][3]; \ 800 in1[4] = out[1][4]; \ 801 \ 802 in2[0] = vec_add(out[2][0], out[2][3]); \ 803 in2[1] = vec_add(out[2][1], out[2][2]); \ 804 in2[2] = vec_sub(out[2][1], out[2][2]); \ 805 in2[3] = vec_sub(out[2][0], out[2][3]); \ 806 in2[4] = vec_sub(out[2][7], out[2][4]); \ 807 in2[5] = vec_sub(out[2][6], out[2][5]); \ 808 in2[6] = vec_add(out[2][5], out[2][6]); \ 809 in2[7] = vec_add(out[2][4], out[2][7]); \ 810 \ 811 in3[0] = vec_add(out[3][0], out[3][3]); \ 812 in3[1] = vec_add(out[3][1], out[3][2]); \ 813 in3[2] = vec_sub(out[3][1], out[3][2]); \ 814 in3[3] = vec_sub(out[3][0], out[3][3]); \ 815 in3[4] = vec_sub(out[3][7], out[3][4]); \ 816 in3[5] = vec_sub(out[3][6], out[3][5]); \ 817 in3[6] = 
vec_add(out[3][5], out[3][6]); \ 818 in3[7] = vec_add(out[3][4], out[3][7]); \ 819 \ 820 /* stage 5 */ \ 821 out[0][0] = vec_add(in0[0], in0[3]); \ 822 out[0][1] = vec_add(in0[1], in0[2]); \ 823 out[0][2] = vec_sub(in0[1], in0[2]); \ 824 out[0][3] = vec_sub(in0[0], in0[3]); \ 825 out[0][4] = in0[4]; \ 826 STEP16_1(in0[6], in0[5], out[0][5], out[0][6], cospi16_v); \ 827 out[0][7] = in0[7]; \ 828 \ 829 out[1][0] = vec_add(in1[0], in1[3]); \ 830 out[1][1] = vec_add(in1[1], in1[2]); \ 831 out[1][2] = vec_sub(in1[1], in1[2]); \ 832 out[1][3] = vec_sub(in1[0], in1[3]); \ 833 out[1][4] = vec_sub(in1[7], in1[4]); \ 834 out[1][5] = vec_sub(in1[6], in1[5]); \ 835 out[1][6] = vec_add(in1[5], in1[6]); \ 836 out[1][7] = vec_add(in1[4], in1[7]); \ 837 \ 838 out[2][0] = in2[0]; \ 839 out[2][1] = in2[1]; \ 840 STEP32(in2[2], in3[5], out[2][2], out[3][5], cospi8_v, cospi24_v); \ 841 STEP32(in2[3], in3[4], out[2][3], out[3][4], cospi8_v, cospi24_v); \ 842 STEP32_1(in2[4], in3[3], out[2][4], out[3][3], cospi24_v, cospi8_v, \ 843 cospi8m_v); \ 844 STEP32_1(in2[5], in3[2], out[2][5], out[3][2], cospi24_v, cospi8_v, \ 845 cospi8m_v); \ 846 out[2][6] = in2[6]; \ 847 out[2][7] = in2[7]; \ 848 out[3][0] = in3[0]; \ 849 out[3][1] = in3[1]; \ 850 out[3][6] = in3[6]; \ 851 out[3][7] = in3[7]; \ 852 \ 853 /* stage 6 */ \ 854 in0[0] = vec_add(out[0][0], out[0][7]); \ 855 in0[1] = vec_add(out[0][1], out[0][6]); \ 856 in0[2] = vec_add(out[0][2], out[0][5]); \ 857 in0[3] = vec_add(out[0][3], out[0][4]); \ 858 in0[4] = vec_sub(out[0][3], out[0][4]); \ 859 in0[5] = vec_sub(out[0][2], out[0][5]); \ 860 in0[6] = vec_sub(out[0][1], out[0][6]); \ 861 in0[7] = vec_sub(out[0][0], out[0][7]); \ 862 in1[0] = out[1][0]; \ 863 in1[1] = out[1][1]; \ 864 STEP16_1(out[1][5], out[1][2], in1[2], in1[5], cospi16_v); \ 865 STEP16_1(out[1][4], out[1][3], in1[3], in1[4], cospi16_v); \ 866 in1[6] = out[1][6]; \ 867 in1[7] = out[1][7]; \ 868 \ 869 in2[0] = vec_add(out[2][0], out[2][7]); \ 870 in2[1] = vec_add(out[2][1], 
out[2][6]); \ 871 in2[2] = vec_add(out[2][2], out[2][5]); \ 872 in2[3] = vec_add(out[2][3], out[2][4]); \ 873 in2[4] = vec_sub(out[2][3], out[2][4]); \ 874 in2[5] = vec_sub(out[2][2], out[2][5]); \ 875 in2[6] = vec_sub(out[2][1], out[2][6]); \ 876 in2[7] = vec_sub(out[2][0], out[2][7]); \ 877 \ 878 in3[0] = vec_sub(out[3][7], out[3][0]); \ 879 in3[1] = vec_sub(out[3][6], out[3][1]); \ 880 in3[2] = vec_sub(out[3][5], out[3][2]); \ 881 in3[3] = vec_sub(out[3][4], out[3][3]); \ 882 in3[4] = vec_add(out[3][4], out[3][3]); \ 883 in3[5] = vec_add(out[3][5], out[3][2]); \ 884 in3[6] = vec_add(out[3][6], out[3][1]); \ 885 in3[7] = vec_add(out[3][7], out[3][0]); \ 886 \ 887 /* stage 7 */ \ 888 out[0][0] = vec_add(in0[0], in1[7]); \ 889 out[0][1] = vec_add(in0[1], in1[6]); \ 890 out[0][2] = vec_add(in0[2], in1[5]); \ 891 out[0][3] = vec_add(in0[3], in1[4]); \ 892 out[0][4] = vec_add(in0[4], in1[3]); \ 893 out[0][5] = vec_add(in0[5], in1[2]); \ 894 out[0][6] = vec_add(in0[6], in1[1]); \ 895 out[0][7] = vec_add(in0[7], in1[0]); \ 896 out[1][0] = vec_sub(in0[7], in1[0]); \ 897 out[1][1] = vec_sub(in0[6], in1[1]); \ 898 out[1][2] = vec_sub(in0[5], in1[2]); \ 899 out[1][3] = vec_sub(in0[4], in1[3]); \ 900 out[1][4] = vec_sub(in0[3], in1[4]); \ 901 out[1][5] = vec_sub(in0[2], in1[5]); \ 902 out[1][6] = vec_sub(in0[1], in1[6]); \ 903 out[1][7] = vec_sub(in0[0], in1[7]); \ 904 \ 905 out[2][0] = in2[0]; \ 906 out[2][1] = in2[1]; \ 907 out[2][2] = in2[2]; \ 908 out[2][3] = in2[3]; \ 909 STEP16_1(in3[3], in2[4], out[2][4], out[3][3], cospi16_v); \ 910 STEP16_1(in3[2], in2[5], out[2][5], out[3][2], cospi16_v); \ 911 STEP16_1(in3[1], in2[6], out[2][6], out[3][1], cospi16_v); \ 912 STEP16_1(in3[0], in2[7], out[2][7], out[3][0], cospi16_v); \ 913 out[3][4] = in3[4]; \ 914 out[3][5] = in3[5]; \ 915 out[3][6] = in3[6]; \ 916 out[3][7] = in3[7]; \ 917 \ 918 /* final */ \ 919 in0[0] = vec_add(out[0][0], out[3][7]); \ 920 in0[1] = vec_add(out[0][1], out[3][6]); \ 921 in0[2] = vec_add(out[0][2], 
out[3][5]); \ 922 in0[3] = vec_add(out[0][3], out[3][4]); \ 923 in0[4] = vec_add(out[0][4], out[3][3]); \ 924 in0[5] = vec_add(out[0][5], out[3][2]); \ 925 in0[6] = vec_add(out[0][6], out[3][1]); \ 926 in0[7] = vec_add(out[0][7], out[3][0]); \ 927 in1[0] = vec_add(out[1][0], out[2][7]); \ 928 in1[1] = vec_add(out[1][1], out[2][6]); \ 929 in1[2] = vec_add(out[1][2], out[2][5]); \ 930 in1[3] = vec_add(out[1][3], out[2][4]); \ 931 in1[4] = vec_add(out[1][4], out[2][3]); \ 932 in1[5] = vec_add(out[1][5], out[2][2]); \ 933 in1[6] = vec_add(out[1][6], out[2][1]); \ 934 in1[7] = vec_add(out[1][7], out[2][0]); \ 935 in2[0] = vec_sub(out[1][7], out[2][0]); \ 936 in2[1] = vec_sub(out[1][6], out[2][1]); \ 937 in2[2] = vec_sub(out[1][5], out[2][2]); \ 938 in2[3] = vec_sub(out[1][4], out[2][3]); \ 939 in2[4] = vec_sub(out[1][3], out[2][4]); \ 940 in2[5] = vec_sub(out[1][2], out[2][5]); \ 941 in2[6] = vec_sub(out[1][1], out[2][6]); \ 942 in2[7] = vec_sub(out[1][0], out[2][7]); \ 943 in3[0] = vec_sub(out[0][7], out[3][0]); \ 944 in3[1] = vec_sub(out[0][6], out[3][1]); \ 945 in3[2] = vec_sub(out[0][5], out[3][2]); \ 946 in3[3] = vec_sub(out[0][4], out[3][3]); \ 947 in3[4] = vec_sub(out[0][3], out[3][4]); \ 948 in3[5] = vec_sub(out[0][2], out[3][5]); \ 949 in3[6] = vec_sub(out[0][1], out[3][6]); \ 950 in3[7] = vec_sub(out[0][0], out[3][7]); 951 952 // NOT A FULL TRANSPOSE! 
Transposes just each 8x8 block in each row,
// does not transpose rows
//
// in/out are 4x8 arrays of int16x8_t: in[i] is one 8x8 block (8 vectors of
// 8 int16 lanes) of a 32x8 strip. Each of the 4 blocks is transposed
// independently with TRANSPOSE8x8; the blocks themselves keep their
// positions, so the full 32x8 strip is NOT transposed.
#define TRANSPOSE_8x32(in, out)                                              \
  /* transpose 4 of 8x8 blocks */                                            \
  TRANSPOSE8x8(in[0][0], in[0][1], in[0][2], in[0][3], in[0][4], in[0][5],   \
               in[0][6], in[0][7], out[0][0], out[0][1], out[0][2],          \
               out[0][3], out[0][4], out[0][5], out[0][6], out[0][7]);       \
  TRANSPOSE8x8(in[1][0], in[1][1], in[1][2], in[1][3], in[1][4], in[1][5],   \
               in[1][6], in[1][7], out[1][0], out[1][1], out[1][2],          \
               out[1][3], out[1][4], out[1][5], out[1][6], out[1][7]);       \
  TRANSPOSE8x8(in[2][0], in[2][1], in[2][2], in[2][3], in[2][4], in[2][5],   \
               in[2][6], in[2][7], out[2][0], out[2][1], out[2][2],          \
               out[2][3], out[2][4], out[2][5], out[2][6], out[2][7]);       \
  TRANSPOSE8x8(in[3][0], in[3][1], in[3][2], in[3][3], in[3][4], in[3][5],   \
               in[3][6], in[3][7], out[3][0], out[3][1], out[3][2],          \
               out[3][3], out[3][4], out[3][5], out[3][6], out[3][7]);

// Reconstruct one 32-pixel destination row: add the int16 residuals
// in0..in3 (4 vectors x 8 lanes = 32 values) to row `step` of dest and
// store back with unsigned saturation (vec_packsu).
//
// Expands to multiple statements and relies on locals declared by the
// caller: dst, d_uh, d_ul, add, shift6, zerov, plus dest and stride.
// PIXEL_ADD is defined elsewhere; given the add/shift6 setup in the caller
// it presumably performs (residual + pixel + 32) >> 6 -- confirm against
// its definition.
#define PIXEL_ADD_STORE32(in0, in1, in2, in3, step)            \
  /* pixels 0..15 of the row */                                \
  dst = vec_vsx_ld((step)*stride, dest);                       \
  /* widen the 16 u8 pixels to two int16x8_t halves */         \
  d_uh = (int16x8_t)vec_mergeh(dst, zerov);                    \
  d_ul = (int16x8_t)vec_mergel(dst, zerov);                    \
  PIXEL_ADD(in0, d_uh, add, shift6);                           \
  PIXEL_ADD(in1, d_ul, add, shift6);                           \
  vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride, dest);     \
  /* pixels 16..31 of the row (next 16 bytes) */               \
  dst = vec_vsx_ld((step)*stride + 16, dest);                  \
  d_uh = (int16x8_t)vec_mergeh(dst, zerov);                    \
  d_ul = (int16x8_t)vec_mergel(dst, zerov);                    \
  PIXEL_ADD(in2, d_uh, add, shift6);                           \
  PIXEL_ADD(in3, d_ul, add, shift6);                           \
  vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride + 16, dest);

// Reconstruct 8 consecutive destination rows from a 4x8 strip `in`:
// in[j][r] supplies pixels 8j..8j+7 of row (offset + r).
#define ADD_STORE_BLOCK(in, offset)                                       \
  PIXEL_ADD_STORE32(in[0][0], in[1][0], in[2][0], in[3][0], offset + 0); \
  PIXEL_ADD_STORE32(in[0][1], in[1][1], in[2][1], in[3][1], offset + 1); \
  PIXEL_ADD_STORE32(in[0][2], in[1][2], in[2][2], in[3][2], offset + 2); \
  PIXEL_ADD_STORE32(in[0][3], in[1][3], in[2][3], in[3][3], offset + 3); \
  PIXEL_ADD_STORE32(in[0][4], in[1][4], in[2][4], in[3][4], offset + 4); \
  PIXEL_ADD_STORE32(in[0][5], in[1][5], in[2][5], in[3][5], offset + 5); \
  PIXEL_ADD_STORE32(in[0][6], in[1][6], in[2][6], in[3][6], offset + 6); \
  PIXEL_ADD_STORE32(in[0][7], in[1][7], in[2][7], in[3][7], offset + 7);

// Full 32x32 inverse DCT (the "1024" = all-coefficients variant) with
// reconstruction: dest += round(idct(input) >> 6), saturated to u8.
// Classic row/column decomposition: four 32x8 horizontal strips are
// transformed (rows), then the four 32-high columns of strips (columns),
// then the result is added to the destination.
void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest,
                                int stride) {
  // srcN[i][j]: strip N (rows 8N..8N+7), block column i (pixels 8i..8i+7),
  // row j within the strip. tmp is a same-shaped scratch strip.
  int16x8_t src0[4][8], src1[4][8], src2[4][8], src3[4][8], tmp[4][8];
  // Scratch registers not referenced directly in this body -- presumably
  // consumed inside the LOAD_8x32/IDCT32 macro expansions; confirm against
  // their definitions.
  int16x8_t tmp16_0, tmp16_1;
  int32x4_t temp10, temp11, temp20, temp21, temp30;
  // Pixel add/store scratch used inside PIXEL_ADD_STORE32.
  uint8x16_t dst;
  int16x8_t d_uh, d_ul;
  // Rounding bias for the final >> 6: 8 << 2 == 32 == 1 << (6 - 1).
  int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2));
  uint16x8_t shift6 = vec_splat_u16(6);
  uint8x16_t zerov = vec_splat_u8(0);

  ROUND_SHIFT_INIT;

  // Load the first 8 rows of coefficients (offset 0).
  LOAD_8x32(load_tran_low, src0[0][0], src0[1][0], src0[2][0], src0[3][0],
            src0[0][1], src0[1][1], src0[2][1], src0[3][1], src0[0][2],
            src0[1][2], src0[2][2], src0[3][2], src0[0][3], src0[1][3],
            src0[2][3], src0[3][3], src0[0][4], src0[1][4], src0[2][4],
            src0[3][4], src0[0][5], src0[1][5], src0[2][5], src0[3][5],
            src0[0][6], src0[1][6], src0[2][6], src0[3][6], src0[0][7],
            src0[1][7], src0[2][7], src0[3][7], 0);
  // Rows
  // transpose the first row of 8x8 blocks
  TRANSPOSE_8x32(src0, tmp);
  // transform the 32x8 column
  // (IDCT32's final stage writes the result back into its first four
  // arguments -- here tmp -- using the fifth as scratch.)
  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src0);
  // Transpose the transformed strip back into src0.
  TRANSPOSE_8x32(tmp, src0);

  // Rows 8..15 (offset 512 into the coefficient buffer).
  LOAD_8x32(load_tran_low, src1[0][0], src1[1][0], src1[2][0], src1[3][0],
            src1[0][1], src1[1][1], src1[2][1], src1[3][1], src1[0][2],
            src1[1][2], src1[2][2], src1[3][2], src1[0][3], src1[1][3],
            src1[2][3], src1[3][3], src1[0][4], src1[1][4], src1[2][4],
            src1[3][4], src1[0][5], src1[1][5], src1[2][5], src1[3][5],
            src1[0][6], src1[1][6], src1[2][6], src1[3][6], src1[0][7],
            src1[1][7], src1[2][7], src1[3][7], 512);
  TRANSPOSE_8x32(src1, tmp);
  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src1);
  TRANSPOSE_8x32(tmp, src1);

  // Rows 16..23 (offset 1024).
  LOAD_8x32(load_tran_low, src2[0][0], src2[1][0], src2[2][0], src2[3][0],
            src2[0][1], src2[1][1], src2[2][1], src2[3][1], src2[0][2],
            src2[1][2], src2[2][2], src2[3][2], src2[0][3], src2[1][3],
            src2[2][3], src2[3][3], src2[0][4], src2[1][4], src2[2][4],
            src2[3][4], src2[0][5], src2[1][5], src2[2][5], src2[3][5],
            src2[0][6], src2[1][6], src2[2][6], src2[3][6], src2[0][7],
            src2[1][7], src2[2][7], src2[3][7], 1024);
  TRANSPOSE_8x32(src2, tmp);
  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src2);
  TRANSPOSE_8x32(tmp, src2);

  // Rows 24..31 (offset 1536).
  LOAD_8x32(load_tran_low, src3[0][0], src3[1][0], src3[2][0], src3[3][0],
            src3[0][1], src3[1][1], src3[2][1], src3[3][1], src3[0][2],
            src3[1][2], src3[2][2], src3[3][2], src3[0][3], src3[1][3],
            src3[2][3], src3[3][3], src3[0][4], src3[1][4], src3[2][4],
            src3[3][4], src3[0][5], src3[1][5], src3[2][5], src3[3][5],
            src3[0][6], src3[1][6], src3[2][6], src3[3][6], src3[0][7],
            src3[1][7], src3[2][7], src3[3][7], 1536);
  TRANSPOSE_8x32(src3, tmp);
  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src3);
  TRANSPOSE_8x32(tmp, src3);

  // Columns
  // One 32-point transform per 8-pixel block column; results land back in
  // src0..src3 (IDCT32's first four arguments), tmp is scratch.
  IDCT32(src0[0], src1[0], src2[0], src3[0], tmp);
  IDCT32(src0[1], src1[1], src2[1], src3[1], tmp);
  IDCT32(src0[2], src1[2], src2[2], src3[2], tmp);
  IDCT32(src0[3], src1[3], src2[3], src3[3], tmp);

  // Add the residuals to the destination, 8 rows per strip.
  ADD_STORE_BLOCK(src0, 0);
  ADD_STORE_BLOCK(src1, 8);
  ADD_STORE_BLOCK(src2, 16);
  ADD_STORE_BLOCK(src3, 24);
}