/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>

#include "vp9/common/vp9_enums.h"
#include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
#include "vpx_dsp/mips/fwd_txfm_msa.h"

/* Column ADST16, first half (stages 1-2) for one 8-column slice.
 * Loads all 16 input rows (stride elements apart), up-scales them via
 * SLLI_4V(..., 2), and runs the MADD_BF butterfly stages against the
 * constant table const0, leaving intermediate vectors in int_buf for
 * fadst16_cols_step2_msa.
 * NOTE(review): LD_SH/LD_SW2/MADD_BF/BUTTERFLY_4/ST_SH* are MSA vector
 * macros from the included fwd_txfm_msa.h / vp9_fdct_msa.h headers; their
 * exact semantics are not visible here — behavior descriptions below follow
 * the macro names. */
static void fadst16_cols_step1_msa(const int16_t *input, int32_t stride,
                                   const int32_t *const0, int16_t *int_buf) {
  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
  v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
  v4i32 k0, k1, k2, k3;

  /* load input data */
  r0 = LD_SH(input);
  r15 = LD_SH(input + 15 * stride);
  r7 = LD_SH(input + 7 * stride);
  r8 = LD_SH(input + 8 * stride);
  SLLI_4V(r0, r15, r7, r8, 2);

  /* stage 1 */
  LD_SW2(const0, 4, k0, k1);
  LD_SW2(const0 + 8, 4, k2, k3);
  MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);

  r3 = LD_SH(input + 3 * stride);
  r4 = LD_SH(input + 4 * stride);
  r11 = LD_SH(input + 11 * stride);
  r12 = LD_SH(input + 12 * stride);
  SLLI_4V(r3, r4, r11, r12, 2);

  LD_SW2(const0 + 4 * 4, 4, k0, k1);
  LD_SW2(const0 + 4 * 6, 4, k2, k3);
  MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);

  /* stage 2 */
  BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
  ST_SH2(tp0, tp2, int_buf, 8);
  ST_SH2(tp1, tp3, int_buf + 4 * 8, 8);

  LD_SW2(const0 + 4 * 8, 4, k0, k1);
  k2 = LD_SW(const0 + 4 * 10);
  MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);

  ST_SH2(h0, h1, int_buf + 8 * 8, 8);
  /* note: h3 stored before h2 — ordering consumed by step 2 */
  ST_SH2(h3, h2, int_buf + 12 * 8, 8);

  r9 = LD_SH(input + 9 * stride);
  r6 = LD_SH(input + 6 * stride);
  r1 = LD_SH(input + stride);
  r14 = LD_SH(input + 14 * stride);
  SLLI_4V(r9, r6, r1, r14, 2);

  LD_SW2(const0 + 4 * 11, 4, k0, k1);
  LD_SW2(const0 + 4 * 13, 4, k2, k3);
  MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);

  ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);

  r13 = LD_SH(input + 13 * stride);
  r2 = LD_SH(input + 2 * stride);
  r5 = LD_SH(input + 5 * stride);
  r10 = LD_SH(input + 10 * stride);
  SLLI_4V(r13, r2, r5, r10, 2);

  LD_SW2(const0 + 4 * 15, 4, k0, k1);
  LD_SW2(const0 + 4 * 17, 4, k2, k3);
  MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);

  ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);

  BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
  ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
}

/* Column ADST16, second half (stages 3-4): consumes the intermediates in
 * int_buf produced by fadst16_cols_step1_msa, applies the remaining
 * butterflies and MADD_SHORT rotations, and scatters the 16 result rows
 * across out (first half) and out_ptr = out + 128 (second half), negating
 * selected outputs per the ADST sign pattern. Row stride is 16 int16. */
static void fadst16_cols_step2_msa(int16_t *int_buf, const int32_t *const0,
                                   int16_t *out) {
  int16_t *out_ptr = out + 128;
  v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15;
  v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11;
  v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
  v4i32 k0, k1, k2, k3;

  LD_SH2(int_buf + 3 * 8, 4 * 8, g13, g15);
  LD_SH2(int_buf + 11 * 8, 4 * 8, g5, g7);
  LD_SW2(const0 + 4 * 19, 4, k0, k1);
  k2 = LD_SW(const0 + 4 * 21);
  MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);

  tp0 = LD_SH(int_buf + 4 * 8);
  tp1 = LD_SH(int_buf + 5 * 8);
  tp3 = LD_SH(int_buf + 10 * 8);
  tp2 = LD_SH(int_buf + 14 * 8);
  LD_SW2(const0 + 4 * 22, 4, k0, k1);
  k2 = LD_SW(const0 + 4 * 24);
  MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7);
  out4 = -out4;
  ST_SH(out4, (out + 3 * 16));
  ST_SH(out5, (out_ptr + 4 * 16));

  h1 = LD_SH(int_buf + 9 * 8);
  h3 = LD_SH(int_buf + 12 * 8);
  /* reuses k0/k1/k2 loaded just above (const0 + 4*22 .. 4*24) */
  MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
  out13 = -out13;
  ST_SH(out12, (out + 2 * 16));
  ST_SH(out13, (out_ptr + 5 * 16));

  tp0 = LD_SH(int_buf);
  tp1 = LD_SH(int_buf + 8);
  tp2 = LD_SH(int_buf + 2 * 8);
  tp3 = LD_SH(int_buf + 6 * 8);

  BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10);
  out1 = -out1;
  ST_SH(out0, (out));
  ST_SH(out1, (out_ptr + 7 * 16));

  h0 = LD_SH(int_buf + 8 * 8);
  h2 = LD_SH(int_buf + 13 * 8);

  BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
  out8 = -out8;
  ST_SH(out8, (out + 16));
  ST_SH(out9, (out_ptr + 6 * 16));

  /* stage 4 */
  LD_SW2(const0 + 4 * 25, 4, k0, k1);
  LD_SW2(const0 + 4 * 27, 4, k2, k3);
  MADD_SHORT(h10, h11, k1, k2, out2, out3);
  ST_SH(out2, (out + 7 * 16));
  ST_SH(out3, (out_ptr));

  MADD_SHORT(out6, out7, k0, k3, out6, out7);
  ST_SH(out6, (out + 4 * 16));
  ST_SH(out7, (out_ptr + 3 * 16));

  MADD_SHORT(out10, out11, k0, k3, out10, out11);
  ST_SH(out10, (out + 6 * 16));
  ST_SH(out11, (out_ptr + 16));

  MADD_SHORT(out14, out15, k1, k2, out14, out15);
  ST_SH(out14, (out + 5 * 16));
  ST_SH(out15, (out_ptr + 2 * 16));
}

/* Transpose-and-round pass between the column and row transforms:
 * processes the 16x16 int16 buffer as four 8x8 tiles, transposing each
 * (TRANSPOSE8x8_SH_SH) and applying the FDCT_POSTPROC_2V_NEG_H rounding
 * macro to every vector before storing into out (64 elements per tile). */
static void fadst16_transpose_postproc_msa(int16_t *input, int16_t *out) {
  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
  v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;

  /* load input data */
  LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
                     r7);
  FDCT_POSTPROC_2V_NEG_H(r0, r1);
  FDCT_POSTPROC_2V_NEG_H(r2, r3);
  FDCT_POSTPROC_2V_NEG_H(r4, r5);
  FDCT_POSTPROC_2V_NEG_H(r6, r7);
  ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
  out += 64;

  LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
                     r12, r13, r14, r15);
  FDCT_POSTPROC_2V_NEG_H(r8, r9);
  FDCT_POSTPROC_2V_NEG_H(r10, r11);
  FDCT_POSTPROC_2V_NEG_H(r12, r13);
  FDCT_POSTPROC_2V_NEG_H(r14, r15);
  ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
  out += 64;

  /* load input data */
  input += 128;
  LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
                     r7);
  FDCT_POSTPROC_2V_NEG_H(r0, r1);
  FDCT_POSTPROC_2V_NEG_H(r2, r3);
  FDCT_POSTPROC_2V_NEG_H(r4, r5);
  FDCT_POSTPROC_2V_NEG_H(r6, r7);
  ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
  out += 64;

  LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
                     r12, r13, r14, r15);
  FDCT_POSTPROC_2V_NEG_H(r8, r9);
  FDCT_POSTPROC_2V_NEG_H(r10, r11);
  FDCT_POSTPROC_2V_NEG_H(r12, r13);
  FDCT_POSTPROC_2V_NEG_H(r14, r15);
  ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
}

/* Row ADST16, first half (stages 1-2). Mirror of fadst16_cols_step1_msa but
 * reads the already-transposed, already-rounded buffer: fixed row stride of
 * 8 int16 and no SLLI_4V up-scaling. Intermediates go to int_buf for
 * fadst16_rows_step2_msa. */
static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0,
                                   int16_t *int_buf) {
  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
  v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
  v4i32 k0, k1, k2, k3;

  /* load input data */
  r0 = LD_SH(input);
  r7 = LD_SH(input + 7 * 8);
  r8 = LD_SH(input + 8 * 8);
  r15 = LD_SH(input + 15 * 8);

  /* stage 1 */
  LD_SW2(const0, 4, k0, k1);
  LD_SW2(const0 + 4 * 2, 4, k2, k3);
  MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);

  r3 = LD_SH(input + 3 * 8);
  r4 = LD_SH(input + 4 * 8);
  r11 = LD_SH(input + 11 * 8);
  r12 = LD_SH(input + 12 * 8);

  LD_SW2(const0 + 4 * 4, 4, k0, k1);
  LD_SW2(const0 + 4 * 6, 4, k2, k3);
  MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);

  /* stage 2 */
  BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
  ST_SH2(tp0, tp1, int_buf, 4 * 8);
  ST_SH2(tp2, tp3, int_buf + 8, 4 * 8);

  LD_SW2(const0 + 4 * 8, 4, k0, k1);
  k2 = LD_SW(const0 + 4 * 10);
  MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
  ST_SH2(h0, h3, int_buf + 8 * 8, 4 * 8);
  ST_SH2(h1, h2, int_buf + 9 * 8, 4 * 8);

  r1 = LD_SH(input + 8);
  r6 = LD_SH(input + 6 * 8);
  r9 = LD_SH(input + 9 * 8);
  r14 = LD_SH(input + 14 * 8);

  LD_SW2(const0 + 4 * 11, 4, k0, k1);
  LD_SW2(const0 + 4 * 13, 4, k2, k3);
  MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
  ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);

  r2 = LD_SH(input + 2 * 8);
  r5 = LD_SH(input + 5 * 8);
  r10 = LD_SH(input + 10 * 8);
  r13 = LD_SH(input + 13 * 8);

  LD_SW2(const0 + 4 * 15, 4, k0, k1);
  LD_SW2(const0 + 4 * 17, 4, k2, k3);
  MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
  ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
  BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
  ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
}

/* Row ADST16, second half (stages 3-4). Mirror of fadst16_cols_step2_msa;
 * here out_ptr = out + 8 targets the second 8-column half of each 16-wide
 * output row (row stride 16 int16). */
static void fadst16_rows_step2_msa(int16_t *int_buf, const int32_t *const0,
                                   int16_t *out) {
  int16_t *out_ptr = out + 8;
  v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15;
  v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11;
  v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
  v4i32 k0, k1, k2, k3;

  g13 = LD_SH(int_buf + 3 * 8);
  g15 = LD_SH(int_buf + 7 * 8);
  g5 = LD_SH(int_buf + 11 * 8);
  g7 = LD_SH(int_buf + 15 * 8);

  LD_SW2(const0 + 4 * 19, 4, k0, k1);
  k2 = LD_SW(const0 + 4 * 21);
  MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);

  tp0 = LD_SH(int_buf + 4 * 8);
  tp1 = LD_SH(int_buf + 5 * 8);
  tp3 = LD_SH(int_buf + 10 * 8);
  tp2 = LD_SH(int_buf + 14 * 8);

  LD_SW2(const0 + 4 * 22, 4, k0, k1);
  k2 = LD_SW(const0 + 4 * 24);
  MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7);
  out4 = -out4;
  ST_SH(out4, (out + 3 * 16));
  ST_SH(out5, (out_ptr + 4 * 16));

  h1 = LD_SH(int_buf + 9 * 8);
  h3 = LD_SH(int_buf + 12 * 8);
  /* reuses k0/k1/k2 loaded just above (const0 + 4*22 .. 4*24) */
  MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
  out13 = -out13;
  ST_SH(out12, (out + 2 * 16));
  ST_SH(out13, (out_ptr + 5 * 16));

  tp0 = LD_SH(int_buf);
  tp1 = LD_SH(int_buf + 8);
  tp2 = LD_SH(int_buf + 2 * 8);
  tp3 = LD_SH(int_buf + 6 * 8);

  BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10);
  out1 = -out1;
  ST_SH(out0, (out));
  ST_SH(out1, (out_ptr + 7 * 16));

  h0 = LD_SH(int_buf + 8 * 8);
  h2 = LD_SH(int_buf + 13 * 8);
  BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
  out8 = -out8;
  ST_SH(out8, (out + 16));
  ST_SH(out9, (out_ptr + 6 * 16));

  /* stage 4 */
  LD_SW2(const0 + 4 * 25, 4, k0, k1);
  LD_SW2(const0 + 4 * 27, 4, k2, k3);
  MADD_SHORT(h10, h11, k1, k2, out2, out3);
  ST_SH(out2, (out + 7 * 16));
  ST_SH(out3, (out_ptr));

  MADD_SHORT(out6, out7, k0, k3, out6, out7);
  ST_SH(out6, (out + 4 * 16));
  ST_SH(out7, (out_ptr + 3 * 16));

  MADD_SHORT(out10, out11, k0, k3, out10, out11);
  ST_SH(out10, (out + 6 * 16));
  ST_SH(out11, (out_ptr + 16));

  MADD_SHORT(out14, out15, k1, k2, out14, out15);
  ST_SH(out14, (out + 5 * 16));
  ST_SH(out15, (out_ptr + 2 * 16));
}

/* Final 16x16 transpose of the ADST result, processed as two 128-element
 * halves. The interleaved load order (l0, l8, l1, l9, ...) and store order
 * (r0, r8, r1, r9, ...) pair the two 8x8 tiles of each half so the output
 * rows come out contiguous. */
static void fadst16_transpose_msa(int16_t *input, int16_t *out) {
  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
  v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;

  /* load input data */
  LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14,
          l7, l15);
  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
                     r7);
  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
                     r12, r13, r14, r15);
  ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
  ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
  out += 16 * 8;

  /* load input data */
  input += 128;
  LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14,
          l7, l15);
  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
                     r7);
  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
                     r12, r13, r14, r15);
  ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
  ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
}

/* 16x8 row FDCT with post-processing, used after an ADST column pass
 * (ADST_DCT): transposes the two 8x8 halves, applies
 * FDCT_POSTPROC_2V_NEG_H rounding, splits via BUTTERFLY_16 into even/odd
 * FDCT8x16 stages (the odd half round-trips through `intermediate` to free
 * registers), then transposes the results back into `output`. */
static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) {
  int16_t *temp = intermediate;
  int16_t *out = output;
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11;
  v8i16 in12, in13, in14, in15;

  LD_SH8(temp, 16, in0, in1, in2, in3, in4, in5, in6, in7);
  temp = intermediate + 8;
  LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                     in10, in11, in12, in13, in14, in15);
  FDCT_POSTPROC_2V_NEG_H(in0, in1);
  FDCT_POSTPROC_2V_NEG_H(in2, in3);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  FDCT_POSTPROC_2V_NEG_H(in6, in7);
  FDCT_POSTPROC_2V_NEG_H(in8, in9);
  FDCT_POSTPROC_2V_NEG_H(in10, in11);
  FDCT_POSTPROC_2V_NEG_H(in12, in13);
  FDCT_POSTPROC_2V_NEG_H(in14, in15);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6,
               tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
  /* spill the odd half while the even half is computed */
  temp = intermediate;
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, temp, 16);
  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  temp = intermediate;
  LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
               in4, in5, in6, in7);
  TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
                     tmp1, in1, tmp2, in2, tmp3, in3);
  ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, out, 16);
  TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
                     tmp5, in5, tmp6, in6, tmp7, in7);
  out = output + 8;
  ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, out, 16);
}

/* 16x16 forward hybrid transform, MSA version.
 * tx_type selects the per-dimension transform pair (DCT_DCT / ADST_DCT /
 * DCT_ADST / ADST_ADST from vp9_enums.h). Each pass processes the block as
 * two 8-lane halves. const_arr is the ADST constant table: 29 constants,
 * each replicated 4x so one LD_SW picks up a full v4i32 of a constant.
 * NOTE(review): the constant values are presumably fixed-point cos/sin
 * products matching the scalar vp9 ADST — verify against vp9_dct tables. */
void vp9_fht16x16_msa(const int16_t *input, int16_t *output, int32_t stride,
                      int32_t tx_type) {
  DECLARE_ALIGNED(32, int16_t, tmp[256]);
  DECLARE_ALIGNED(32, int16_t, trans_buf[256]);
  DECLARE_ALIGNED(32, int16_t, tmp_buf[128]);
  int32_t i;
  int16_t *ptmpbuf = &tmp_buf[0];
  int16_t *trans = &trans_buf[0];
  const int32_t const_arr[29 * 4] = {
    52707308,    52707308,    52707308,    52707308,    -1072430300,
    -1072430300, -1072430300, -1072430300, 795618043,   795618043,
    795618043,   795618043,   -721080468,  -721080468,  -721080468,
    -721080468,  459094491,   459094491,   459094491,   459094491,
    -970646691,  -970646691,  -970646691,  -970646691,  1010963856,
    1010963856,  1010963856,  1010963856,  -361743294,  -361743294,
    -361743294,  -361743294,  209469125,   209469125,   209469125,
    209469125,   -1053094788, -1053094788, -1053094788, -1053094788,
    1053160324,  1053160324,  1053160324,  1053160324,  639644520,
    639644520,   639644520,   639644520,   -862444000,  -862444000,
    -862444000,  -862444000,  1062144356,  1062144356,  1062144356,
    1062144356,  -157532337,  -157532337,  -157532337,  -157532337,
    260914709,   260914709,   260914709,   260914709,   -1041559667,
    -1041559667, -1041559667, -1041559667, 920985831,   920985831,
    920985831,   920985831,   -551995675,  -551995675,  -551995675,
    -551995675,  596522295,   596522295,   596522295,   596522295,
    892853362,   892853362,   892853362,   892853362,   -892787826,
    -892787826,  -892787826,  -892787826,  410925857,   410925857,
    410925857,   410925857,   -992012162,  -992012162,  -992012162,
    -992012162,  992077698,   992077698,   992077698,   992077698,
    759246145,   759246145,   759246145,   759246145,   -759180609,
    -759180609,  -759180609,  -759180609,  -759222975,  -759222975,
    -759222975,  -759222975,  759288511,   759288511,   759288511,
    759288511
  };

  switch (tx_type) {
    case DCT_DCT:
      /* column transform */
      for (i = 0; i < 2; ++i) {
        fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
      }

      /* row transform */
      for (i = 0; i < 2; ++i) {
        fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
      }
      break;
    case ADST_DCT:
      /* column transform */
      for (i = 0; i < 2; ++i) {
        fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
        fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
      }

      /* row transform */
      for (i = 0; i < 2; ++i) {
        postproc_fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
      }
      break;
    case DCT_ADST:
      /* column transform */
      for (i = 0; i < 2; ++i) {
        fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
      }

      fadst16_transpose_postproc_msa(tmp, trans);

      /* row transform */
      for (i = 0; i < 2; ++i) {
        fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
        fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
      }

      fadst16_transpose_msa(tmp, output);
      break;
    case ADST_ADST:
      /* column transform */
      for (i = 0; i < 2; ++i) {
        fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
        fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
      }

      fadst16_transpose_postproc_msa(tmp, trans);

      /* row transform */
      for (i = 0; i < 2; ++i) {
        fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
        fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
      }

      fadst16_transpose_msa(tmp, output);
      break;
    default: assert(0); break;
  }
}