1 /* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "vpx_dsp/mips/fwd_txfm_msa.h" 12 13 static void fdct8x32_1d_column_load_butterfly(const int16_t *input, 14 int32_t src_stride, 15 int16_t *temp_buff) { 16 v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 17 v8i16 step0, step1, step2, step3; 18 v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; 19 v8i16 step0_1, step1_1, step2_1, step3_1; 20 21 /* 1st and 2nd set */ 22 LD_SH4(input, src_stride, in0, in1, in2, in3); 23 LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7); 24 LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1); 25 LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1); 26 SLLI_4V(in0, in1, in2, in3, 2); 27 SLLI_4V(in4, in5, in6, in7, 2); 28 SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); 29 SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); 30 BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, 31 step3, in4, in5, in6, in7); 32 BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1, 33 step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); 34 ST_SH4(step0, step1, step2, step3, temp_buff, 8); 35 ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8); 36 ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8); 37 ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8); 38 39 /* 3rd and 4th set */ 40 LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3); 41 LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7); 42 LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1); 43 LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1); 44 SLLI_4V(in0, in1, in2, in3, 2); 45 SLLI_4V(in4, in5, in6, in7, 2); 46 SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); 47 SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); 48 BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, 49 step3, in4, in5, in6, in7); 50 BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1, 51 step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); 52 ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8); 53 ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8); 54 ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8); 55 ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (15 * 8) + 8, 8); 56 } 57 58 static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) { 59 v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 60 v8i16 in8, in9, in10, in11, in12, in13, in14, in15; 61 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 62 v8i16 temp0, temp1; 63 64 /* fdct even */ 65 LD_SH4(input, 8, in0, in1, in2, in3); 66 LD_SH4(input + 96, 8, in12, in13, in14, in15); 67 BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2, 68 vec3, in12, in13, in14, in15); 69 LD_SH4(input + 32, 8, in4, in5, in6, in7); 70 LD_SH4(input + 64, 8, in8, in9, in10, in11); 71 BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7, 72 in8, in9, in10, in11); 73 74 /* Stage 3 */ 75 ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); 76 BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); 77 DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); 78 FDCT32_POSTPROC_2V_POS_H(temp0, temp1); 79 ST_SH(temp0, temp); 80 ST_SH(temp1, temp + 512); 81 82 DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); 83 FDCT32_POSTPROC_2V_POS_H(temp0, temp1); 84 ST_SH(temp0, temp + 256); 85 ST_SH(temp1, temp + 768); 86 87 SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4); 88 DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); 89 ADD2(vec4, vec5, vec7, vec6, vec0, vec1); 90 DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); 91 FDCT32_POSTPROC_2V_POS_H(temp0, temp1); 92 ST_SH(temp0, temp + 128); 93 ST_SH(temp1, temp + 896); 94 95 SUB2(vec4, vec5, vec7, vec6, vec4, vec7); 96 DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); 97 FDCT32_POSTPROC_2V_POS_H(temp0, temp1); 98 ST_SH(temp0, temp + 640); 99 ST_SH(temp1, temp + 384); 100 101 DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); 102 DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); 103 ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); 104 DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); 105 ADD2(in0, in1, in2, in3, vec0, vec7); 106 DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); 107 FDCT32_POSTPROC_2V_POS_H(temp0, temp1); 108 ST_SH(temp0, temp + 64); 109 ST_SH(temp1, temp + 960); 110 111 SUB2(in0, in1, in2, in3, in0, in2); 112 DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); 113 FDCT32_POSTPROC_2V_POS_H(temp0, temp1); 114 ST_SH(temp0, temp + 576); 115 ST_SH(temp1, temp + 448); 116 117 SUB2(in9, vec2, in14, vec5, vec2, vec5); 118 DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); 119 SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); 120 DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); 121 FDCT32_POSTPROC_2V_POS_H(temp0, temp1); 122 ST_SH(temp0, temp + 320); 123 ST_SH(temp1, temp + 704); 124 125 ADD2(in3, in2, in0, in1, vec3, vec4); 126 DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); 127 FDCT32_POSTPROC_2V_POS_H(temp0, temp1); 128 ST_SH(temp0, temp + 192); 129 ST_SH(temp1, temp + 832); 130 } 131 132 static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) { 133 v8i16 in16, in17, in18, in19, in20, in21, in22, in23; 134 v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; 135 136 in20 = LD_SH(input + 32); 137 in21 = LD_SH(input + 40); 138 in26 = LD_SH(input + 80); 139 in27 = LD_SH(input + 88); 140 141 DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); 142 DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); 143 144 in18 = LD_SH(input + 16); 145 in19 = LD_SH(input + 24); 146 in28 = LD_SH(input + 96); 147 in29 = LD_SH(input + 104); 148 149 vec4 = in19 - in20; 150 ST_SH(vec4, input + 32); 151 vec4 = in18 - in21; 152 ST_SH(vec4, input + 40); 153 vec4 = in29 - in26; 154 ST_SH(vec4, input + 80); 155 vec4 = in28 - in27; 156 ST_SH(vec4, input + 88); 157 158 in21 = in18 + in21; 159 in20 = in19 + in20; 160 in27 = in28 + in27; 161 in26 = in29 + in26; 162 163 LD_SH4(input + 48, 8, in22, in23, in24, in25); 164 DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); 165 DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); 166 167 in16 = LD_SH(input); 168 in17 = LD_SH(input + 8); 169 in30 = LD_SH(input + 112); 170 in31 = LD_SH(input + 120); 171 172 vec4 = in17 - in22; 173 ST_SH(vec4, input + 16); 174 vec4 = in16 - in23; 175 ST_SH(vec4, input + 24); 176 vec4 = in31 - in24; 177 ST_SH(vec4, input + 96); 178 vec4 = in30 - in25; 179 ST_SH(vec4, input + 104); 180 181 ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); 182 DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); 183 DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); 184 ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); 185 DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); 186 ADD2(in27, in26, in25, in24, in23, in20); 187 DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); 188 FDCT32_POSTPROC_2V_POS_H(vec5, vec4); 189 ST_SH(vec5, temp_ptr); 190 ST_SH(vec4, temp_ptr + 960); 191 192 SUB2(in27, in26, in25, in24, in22, in21); 193 DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); 194 FDCT32_POSTPROC_2V_POS_H(vec5, vec4); 195 ST_SH(vec5, temp_ptr + 448); 196 ST_SH(vec4, temp_ptr + 512); 197 198 SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); 199 DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); 200 SUB2(in26, in27, in24, in25, in23, in20); 201 DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); 202 FDCT32_POSTPROC_2V_POS_H(vec5, vec4); 203 ST_SH(vec4, temp_ptr + 704); 204 ST_SH(vec5, temp_ptr + 256); 205 206 ADD2(in26, in27, in24, in25, in22, in21); 207 DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); 208 FDCT32_POSTPROC_2V_POS_H(vec5, vec4); 209 ST_SH(vec4, temp_ptr + 192); 210 ST_SH(vec5, temp_ptr + 768); 211 212 LD_SH4(input + 16, 8, in22, in23, in20, in21); 213 LD_SH4(input + 80, 8, in26, in27, in24, in25); 214 in16 = in20; 215 in17 = in21; 216 DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); 217 DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); 218 SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); 219 DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); 220 ADD2(in28, in29, in31, in30, in16, in19); 221 DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); 222 FDCT32_POSTPROC_2V_POS_H(vec5, vec4); 223 ST_SH(vec5, temp_ptr + 832); 224 ST_SH(vec4, temp_ptr + 128); 225 226 SUB2(in28, in29, in31, in30, in17, in18); 227 DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); 228 FDCT32_POSTPROC_2V_POS_H(vec5, vec4); 229 ST_SH(vec5, temp_ptr + 320); 230 ST_SH(vec4, temp_ptr + 640); 231 ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); 232 DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); 233 SUB2(in29, in28, in30, in31, in16, in19); 234 DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); 235 FDCT32_POSTPROC_2V_POS_H(vec5, vec4); 236 ST_SH(vec5, temp_ptr + 576); 237 ST_SH(vec4, temp_ptr + 384); 238 239 ADD2(in29, in28, in30, in31, in17, in18); 240 DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); 241 FDCT32_POSTPROC_2V_POS_H(vec5, vec4); 242 ST_SH(vec5, temp_ptr + 64); 243 ST_SH(vec4, temp_ptr + 896); 244 } 245 246 static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride, 247 int16_t *tmp_buf, int16_t *tmp_buf_big) { 248 fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf); 249 fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big); 250 fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32)); 251 } 252 253 static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff, 254 int16_t *output) { 255 v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 256 v8i16 in8, in9, in10, in11, in12, in13, in14, in15; 257 v8i16 step0, step1, step2, step3, step4, step5, step6, step7; 258 259 LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7); 260 LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15); 261 TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, 262 in4, in5, in6, in7); 263 TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, 264 in10, in11, in12, in13, in14, in15); 265 BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, 266 in12, in13, in14, in15, step0, step1, step2, step3, step4, step5, 267 step6, step7, in8, in9, in10, in11, in12, in13, in14, in15); 268 ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8); 269 ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8); 270 271 /* 2nd set */ 272 LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7); 273 LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15); 274 TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, 275 in4, in5, in6, in7); 276 TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, 277 in10, in11, in12, in13, in14, in15); 278 BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, 279 in12, in13, in14, in15, step0, step1, step2, step3, step4, step5, 280 step6, step7, in8, in9, in10, in11, in12, in13, in14, in15); 281 ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, 282 (output + 8 * 8), 8); 283 ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8); 284 } 285 286 static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, 287 int16_t *out) { 288 v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 289 v8i16 in8, in9, in10, in11, in12, in13, in14, in15; 290 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 291 v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l; 292 v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r; 293 v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w; 294 295 /* fdct32 even */ 296 /* stage 2 */ 297 LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); 298 LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); 299 300 BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, 301 in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, 302 vec7, in8, in9, in10, in11, in12, in13, in14, in15); 303 ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8); 304 ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8); 305 306 /* Stage 3 */ 307 UNPCK_SH_SW(vec0, vec0_l, vec0_r); 308 UNPCK_SH_SW(vec1, vec1_l, vec1_r); 309 UNPCK_SH_SW(vec2, vec2_l, vec2_r); 310 UNPCK_SH_SW(vec3, vec3_l, vec3_r); 311 UNPCK_SH_SW(vec4, vec4_l, vec4_r); 312 UNPCK_SH_SW(vec5, vec5_l, vec5_r); 313 UNPCK_SH_SW(vec6, vec6_l, vec6_r); 314 UNPCK_SH_SW(vec7, vec7_l, vec7_r); 315 ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w, 316 tmp1_w, tmp2_w, tmp3_w); 317 BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r); 318 ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r, 319 vec1_r, vec2_r, vec3_r); 320 321 tmp3_w = vec0_r + vec3_r; 322 vec0_r = vec0_r - vec3_r; 323 vec3_r = vec1_r + vec2_r; 324 vec1_r = vec1_r - vec2_r; 325 326 DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64, 327 vec4_r, tmp3_w, vec6_r, vec3_r); 328 FDCT32_POSTPROC_NEG_W(vec4_r); 329 FDCT32_POSTPROC_NEG_W(tmp3_w); 330 FDCT32_POSTPROC_NEG_W(vec6_r); 331 FDCT32_POSTPROC_NEG_W(vec3_r); 332 PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); 333 ST_SH2(vec5, vec4, out, 8); 334 335 DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64, 336 vec4_r, tmp3_w, vec6_r, vec3_r); 337 FDCT32_POSTPROC_NEG_W(vec4_r); 338 FDCT32_POSTPROC_NEG_W(tmp3_w); 339 FDCT32_POSTPROC_NEG_W(vec6_r); 340 FDCT32_POSTPROC_NEG_W(vec3_r); 341 PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); 342 ST_SH2(vec5, vec4, out + 16, 8); 343 344 LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); 345 SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); 346 DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); 347 ADD2(vec4, vec5, vec7, vec6, vec0, vec1); 348 DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4); 349 FDCT_POSTPROC_2V_NEG_H(in4, in5); 350 ST_SH(in4, out + 32); 351 ST_SH(in5, out + 56); 352 353 SUB2(vec4, vec5, vec7, vec6, vec4, vec7); 354 DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4); 355 FDCT_POSTPROC_2V_NEG_H(in4, in5); 356 ST_SH(in4, out + 40); 357 ST_SH(in5, out + 48); 358 359 LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); 360 DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); 361 DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); 362 ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); 363 DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); 364 ADD2(in0, in1, in2, in3, vec0, vec7); 365 DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4); 366 FDCT_POSTPROC_2V_NEG_H(in4, in5); 367 ST_SH(in4, out + 64); 368 ST_SH(in5, out + 120); 369 370 SUB2(in0, in1, in2, in3, in0, in2); 371 DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4); 372 FDCT_POSTPROC_2V_NEG_H(in4, in5); 373 ST_SH(in4, out + 72); 374 ST_SH(in5, out + 112); 375 376 SUB2(in9, vec2, in14, vec5, vec2, vec5); 377 DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); 378 SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); 379 DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4); 380 FDCT_POSTPROC_2V_NEG_H(in4, in5); 381 ST_SH(in4, out + 80); 382 ST_SH(in5, out + 104); 383 384 ADD2(in3, in2, in0, in1, vec3, vec4); 385 DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5); 386 FDCT_POSTPROC_2V_NEG_H(in4, in5); 387 ST_SH(in4, out + 96); 388 ST_SH(in5, out + 88); 389 } 390 391 static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) { 392 v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 393 v8i16 in8, in9, in10, in11, in12, in13, in14, in15; 394 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; 395 396 /* fdct32 even */ 397 /* stage 2 */ 398 LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); 399 LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); 400 401 BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, 402 in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, 403 vec7, in8, in9, in10, in11, in12, in13, in14, in15); 404 405 /* Stage 3 */ 406 ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); 407 BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); 408 DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); 409 FDCT_POSTPROC_2V_NEG_H(temp0, temp1); 410 ST_SH(temp0, out); 411 ST_SH(temp1, out + 8); 412 413 DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); 414 FDCT_POSTPROC_2V_NEG_H(temp0, temp1); 415 ST_SH(temp0, out + 16); 416 ST_SH(temp1, out + 24); 417 418 SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); 419 DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); 420 ADD2(vec4, vec5, vec7, vec6, vec0, vec1); 421 DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); 422 FDCT_POSTPROC_2V_NEG_H(temp0, temp1); 423 ST_SH(temp0, out + 32); 424 ST_SH(temp1, out + 56); 425 426 SUB2(vec4, vec5, vec7, vec6, vec4, vec7); 427 DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); 428 FDCT_POSTPROC_2V_NEG_H(temp0, temp1); 429 ST_SH(temp0, out + 40); 430 ST_SH(temp1, out + 48); 431 432 DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); 433 DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); 434 ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); 435 DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); 436 ADD2(in0, in1, in2, in3, vec0, vec7); 437 DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); 438 FDCT_POSTPROC_2V_NEG_H(temp0, temp1); 439 ST_SH(temp0, out + 64); 440 ST_SH(temp1, out + 120); 441 442 SUB2(in0, in1, in2, in3, in0, in2); 443 DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); 444 FDCT_POSTPROC_2V_NEG_H(temp0, temp1); 445 ST_SH(temp0, out + 72); 446 ST_SH(temp1, out + 112); 447 448 SUB2(in9, vec2, in14, vec5, vec2, vec5); 449 DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); 450 SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5) 451 DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); 452 FDCT_POSTPROC_2V_NEG_H(temp0, temp1); 453 ST_SH(temp0, out + 80); 454 ST_SH(temp1, out + 104); 455 456 ADD2(in3, in2, in0, in1, vec3, vec4); 457 DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); 458 FDCT_POSTPROC_2V_NEG_H(temp0, temp1); 459 ST_SH(temp0, out + 96); 460 ST_SH(temp1, out + 88); 461 } 462 463 static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr, 464 int16_t *out) { 465 v8i16 in16, in17, in18, in19, in20, in21, in22, in23; 466 v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; 467 468 in20 = LD_SH(temp + 32); 469 in21 = LD_SH(temp + 40); 470 in26 = LD_SH(temp + 80); 471 in27 = LD_SH(temp + 88); 472 473 DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); 474 DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); 475 476 in18 = LD_SH(temp + 16); 477 in19 = LD_SH(temp + 24); 478 in28 = LD_SH(temp + 96); 479 in29 = LD_SH(temp + 104); 480 481 vec4 = in19 - in20; 482 ST_SH(vec4, interm_ptr + 32); 483 vec4 = in18 - in21; 484 ST_SH(vec4, interm_ptr + 88); 485 vec4 = in28 - in27; 486 ST_SH(vec4, interm_ptr + 56); 487 vec4 = in29 - in26; 488 ST_SH(vec4, interm_ptr + 64); 489 490 ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26); 491 492 in22 = LD_SH(temp + 48); 493 in23 = LD_SH(temp + 56); 494 in24 = LD_SH(temp + 64); 495 in25 = LD_SH(temp + 72); 496 497 DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); 498 DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); 499 500 in16 = LD_SH(temp); 501 in17 = LD_SH(temp + 8); 502 in30 = LD_SH(temp + 112); 503 in31 = LD_SH(temp + 120); 504 505 vec4 = in17 - in22; 506 ST_SH(vec4, interm_ptr + 40); 507 vec4 = in30 - in25; 508 ST_SH(vec4, interm_ptr + 48); 509 vec4 = in31 - in24; 510 ST_SH(vec4, interm_ptr + 72); 511 vec4 = in16 - in23; 512 ST_SH(vec4, interm_ptr + 80); 513 514 ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); 515 DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); 516 DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); 517 518 ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); 519 DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); 520 ADD2(in27, in26, in25, in24, in23, in20); 521 522 DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); 523 FDCT_POSTPROC_2V_NEG_H(vec5, vec4); 524 ST_SH(vec5, out); 525 ST_SH(vec4, out + 120); 526 527 SUB2(in27, in26, in25, in24, in22, in21); 528 529 DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); 530 FDCT_POSTPROC_2V_NEG_H(vec5, vec4); 531 ST_SH(vec5, out + 112); 532 ST_SH(vec4, out + 8); 533 534 SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); 535 DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); 536 SUB2(in26, in27, in24, in25, in23, in20); 537 538 DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); 539 FDCT_POSTPROC_2V_NEG_H(vec5, vec4); 540 ST_SH(vec4, out + 16); 541 ST_SH(vec5, out + 104); 542 543 ADD2(in26, in27, in24, in25, in22, in21); 544 DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); 545 FDCT_POSTPROC_2V_NEG_H(vec5, vec4); 546 ST_SH(vec4, out + 24); 547 ST_SH(vec5, out + 96); 548 549 in20 = LD_SH(interm_ptr + 32); 550 in21 = LD_SH(interm_ptr + 88); 551 in27 = LD_SH(interm_ptr + 56); 552 in26 = LD_SH(interm_ptr + 64); 553 554 in16 = in20; 555 in17 = in21; 556 DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); 557 DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); 558 559 in22 = LD_SH(interm_ptr + 40); 560 in25 = LD_SH(interm_ptr + 48); 561 in24 = LD_SH(interm_ptr + 72); 562 in23 = LD_SH(interm_ptr + 80); 563 564 SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); 565 DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); 566 ADD2(in28, in29, in31, in30, in16, in19); 567 DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); 568 FDCT_POSTPROC_2V_NEG_H(vec5, vec4); 569 ST_SH(vec5, out + 32); 570 ST_SH(vec4, out + 88); 571 572 SUB2(in28, in29, in31, in30, in17, in18); 573 DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); 574 FDCT_POSTPROC_2V_NEG_H(vec5, vec4); 575 ST_SH(vec5, out + 40); 576 ST_SH(vec4, out + 80); 577 578 ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); 579 DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); 580 SUB2(in29, in28, in30, in31, in16, in19); 581 582 DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); 583 FDCT_POSTPROC_2V_NEG_H(vec5, vec4); 584 ST_SH(vec5, out + 72); 585 ST_SH(vec4, out + 48); 586 587 ADD2(in29, in28, in30, in31, in17, in18); 588 589 DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); 590 FDCT_POSTPROC_2V_NEG_H(vec5, vec4); 591 ST_SH(vec4, out + 56); 592 ST_SH(vec5, out + 64); 593 } 594 595 static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) { 596 v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 597 v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; 598 599 /* 1st set */ 600 in0 = LD_SH(temp); 601 in4 = LD_SH(temp + 32); 602 in2 = LD_SH(temp + 64); 603 in6 = LD_SH(temp + 96); 604 in1 = LD_SH(temp + 128); 605 in7 = LD_SH(temp + 152); 606 in3 = LD_SH(temp + 192); 607 in5 = LD_SH(temp + 216); 608 609 TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, 610 in4, in5, in6, in7); 611 612 /* 2nd set */ 613 in0_1 = LD_SH(temp + 16); 614 in1_1 = LD_SH(temp + 232); 615 in2_1 = LD_SH(temp + 80); 616 in3_1 = LD_SH(temp + 168); 617 in4_1 = LD_SH(temp + 48); 618 in5_1 = LD_SH(temp + 176); 619 in6_1 = LD_SH(temp + 112); 620 in7_1 = LD_SH(temp + 240); 621 622 ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32); 623 TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, 624 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); 625 626 /* 3rd set */ 627 in0 = LD_SH(temp + 8); 628 in1 = LD_SH(temp + 136); 629 in2 = LD_SH(temp + 72); 630 in3 = LD_SH(temp + 200); 631 in4 = LD_SH(temp + 40); 632 in5 = LD_SH(temp + 208); 633 in6 = LD_SH(temp + 104); 634 in7 = LD_SH(temp + 144); 635 636 ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 8, 637 32); 638 TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, 639 in4, in5, in6, in7); 640 ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32); 641 642 /* 4th set */ 643 in0_1 = LD_SH(temp + 24); 644 in1_1 = LD_SH(temp + 224); 645 in2_1 = LD_SH(temp + 88); 646 in3_1 = LD_SH(temp + 160); 647 in4_1 = LD_SH(temp + 56); 648 in5_1 = LD_SH(temp + 184); 649 in6_1 = LD_SH(temp + 120); 650 in7_1 = LD_SH(temp + 248); 651 652 TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, 653 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); 654 ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 24, 655 32); 656 } 657 658 static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) { 659 fdct8x32_1d_row_load_butterfly(temp, temp_buf); 660 fdct8x32_1d_row_even(temp_buf, temp_buf); 661 fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128); 662 fdct8x32_1d_row_transpose_store(temp_buf, output); 663 } 664 665 static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf, 666 int16_t *output) { 667 fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); 668 fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf); 669 fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128); 670 fdct8x32_1d_row_transpose_store(tmp_buf, output); 671 } 672 673 void vpx_fdct32x32_msa(const int16_t *input, int16_t *output, 674 int32_t src_stride) { 675 int32_t i; 676 DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); 677 DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); 678 679 /* column transform */ 680 for (i = 0; i < 4; ++i) { 681 fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf, 682 tmp_buf_big + (8 * i)); 683 } 684 685 /* row transform */ 686 fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output); 687 688 /* row transform */ 689 for (i = 1; i < 4; ++i) { 690 fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256)); 691 } 692 } 693 694 static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) { 695 v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 696 v8i16 in8, in9, in10, in11, in12, in13, in14, in15; 697 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; 698 699 /* fdct32 even */ 700 /* stage 2 */ 701 LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); 702 LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); 703 704 BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, 705 in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, 706 vec7, in8, in9, in10, in11, in12, in13, in14, in15); 707 FDCT_POSTPROC_2V_NEG_H(vec0, vec1); 708 FDCT_POSTPROC_2V_NEG_H(vec2, vec3); 709 FDCT_POSTPROC_2V_NEG_H(vec4, vec5); 710 FDCT_POSTPROC_2V_NEG_H(vec6, vec7); 711 FDCT_POSTPROC_2V_NEG_H(in8, in9); 712 FDCT_POSTPROC_2V_NEG_H(in10, in11); 713 FDCT_POSTPROC_2V_NEG_H(in12, in13); 714 FDCT_POSTPROC_2V_NEG_H(in14, in15); 715 716 /* Stage 3 */ 717 ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); 718 719 temp0 = in0 + in3; 720 in0 = in0 - in3; 721 in3 = in1 + in2; 722 in1 = in1 - in2; 723 724 DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0); 725 ST_SH(temp0, out); 726 ST_SH(temp1, out + 8); 727 728 DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); 729 ST_SH(temp0, out + 16); 730 ST_SH(temp1, out + 24); 731 732 SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); 733 DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); 734 ADD2(vec4, vec5, vec7, vec6, vec0, vec1); 735 DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); 736 ST_SH(temp0, out + 32); 737 ST_SH(temp1, out + 56); 738 739 SUB2(vec4, vec5, vec7, vec6, vec4, vec7); 740 DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); 741 ST_SH(temp0, out + 40); 742 ST_SH(temp1, out + 48); 743 744 DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); 745 DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); 746 ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); 747 DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); 748 ADD2(in0, in1, in2, in3, vec0, vec7); 749 DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); 750 ST_SH(temp0, out + 64); 751 ST_SH(temp1, out + 120); 752 753 SUB2(in0, in1, in2, in3, in0, in2); 754 DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); 755 ST_SH(temp0, out + 72); 756 ST_SH(temp1, out + 112); 757 758 SUB2(in9, vec2, in14, vec5, vec2, vec5); 759 DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); 760 SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); 761 DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); 762 ST_SH(temp0, out + 80); 763 ST_SH(temp1, out + 104); 764 765 ADD2(in3, in2, in0, in1, vec3, vec4); 766 DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); 767 ST_SH(temp0, out + 96); 768 ST_SH(temp1, out + 88); 769 } 770 771 static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr, 772 int16_t *out) { 773 v8i16 in16, in17, in18, in19, in20, in21, in22, in23; 774 v8i16 in24, in25, in26, in27, in28, in29, in30, in31; 775 v8i16 vec4, vec5; 776 777 in20 = LD_SH(temp + 32); 778 in21 = LD_SH(temp + 40); 779 in26 = LD_SH(temp + 80); 780 in27 = LD_SH(temp + 88); 781 782 DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); 783 DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); 784 785 FDCT_POSTPROC_2V_NEG_H(in20, in21); 786 FDCT_POSTPROC_2V_NEG_H(in26, in27); 787 788 in18 = LD_SH(temp + 16); 789 in19 = LD_SH(temp + 24); 790 in28 = LD_SH(temp + 96); 791 in29 = LD_SH(temp + 104); 792 793 FDCT_POSTPROC_2V_NEG_H(in18, in19); 794 FDCT_POSTPROC_2V_NEG_H(in28, in29); 795 796 vec4 = in19 - in20; 797 ST_SH(vec4, interm_ptr + 32); 798 vec4 = in18 - in21; 799 ST_SH(vec4, interm_ptr + 88); 800 vec4 = in29 - in26; 801 ST_SH(vec4, interm_ptr + 64); 802 vec4 = in28 - in27; 803 ST_SH(vec4, interm_ptr + 56); 804 805 ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26); 806 807 in22 = LD_SH(temp + 48); 808 in23 = LD_SH(temp + 56); 809 in24 = LD_SH(temp + 64); 810 in25 = LD_SH(temp + 72); 811 812 DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); 813 DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); 814 FDCT_POSTPROC_2V_NEG_H(in22, in23); 815 FDCT_POSTPROC_2V_NEG_H(in24, in25); 816 817 in16 = LD_SH(temp); 818 in17 = LD_SH(temp + 8); 819 in30 = LD_SH(temp + 112); 820 in31 = LD_SH(temp + 120); 821 822 FDCT_POSTPROC_2V_NEG_H(in16, in17); 823 FDCT_POSTPROC_2V_NEG_H(in30, in31); 824 825 vec4 = in17 - in22; 826 ST_SH(vec4, interm_ptr + 40); 827 vec4 = in30 - in25; 828 ST_SH(vec4, interm_ptr + 48); 829 vec4 = in31 - in24; 830 ST_SH(vec4, interm_ptr + 72); 831 vec4 = in16 - in23; 832 ST_SH(vec4, interm_ptr + 80); 833 834 ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); 835 DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); 836 DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); 837 ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); 838 DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); 839 ADD2(in27, in26, in25, in24, in23, in20); 840 DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); 841 ST_SH(vec5, out); 842 ST_SH(vec4, out + 120); 843 844 SUB2(in27, in26, in25, in24, in22, in21); 845 DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); 846 ST_SH(vec5, out + 112); 847 ST_SH(vec4, out + 8); 848 849 SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); 850 DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); 851 SUB2(in26, in27, in24, in25, in23, in20); 852 DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); 853 ST_SH(vec4, out + 16); 854 ST_SH(vec5, out + 104); 855 856 ADD2(in26, in27, in24, in25, in22, in21); 857 DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); 858 ST_SH(vec4, out + 24); 859 ST_SH(vec5, out + 96); 860 861 in20 = LD_SH(interm_ptr + 32); 862 in21 = LD_SH(interm_ptr + 88); 863 in27 = LD_SH(interm_ptr + 56); 864 in26 = LD_SH(interm_ptr + 64); 865 866 in16 = in20; 867 in17 = in21; 868 DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); 869 DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); 870 871 in22 = LD_SH(interm_ptr + 40); 872 in25 = LD_SH(interm_ptr + 48); 873 in24 = LD_SH(interm_ptr + 72); 874 in23 = LD_SH(interm_ptr + 80); 875 876 SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); 877 DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); 878 in16 = in28 + in29; 879 in19 = in31 + in30; 880 DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); 881 ST_SH(vec5, out + 32); 882 ST_SH(vec4, out + 88); 883 884 SUB2(in28, in29, in31, in30, in17, in18); 885 DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); 886 ST_SH(vec5, out + 40); 887 ST_SH(vec4, out + 80); 888 889 ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); 890 DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); 891 SUB2(in29, in28, in30, in31, in16, in19); 892 DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); 893 ST_SH(vec5, out + 72); 894 ST_SH(vec4, out + 48); 895 896 ADD2(in29, in28, in30, in31, in17, in18); 897 DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); 898 ST_SH(vec4, out + 56); 899 ST_SH(vec5, out + 64); 900 } 901 902 static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf, 903 int16_t *output) { 904 fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); 905 fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf); 906 fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128)); 907 fdct8x32_1d_row_transpose_store(tmp_buf, output); 908 } 909 910 void vpx_fdct32x32_rd_msa(const int16_t *input, int16_t *out, 911 int32_t src_stride) { 912 int32_t i; 913 DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); 914 DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); 915 916 /* column transform */ 917 for (i = 0; i < 4; ++i) { 918 fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0], 919 &tmp_buf_big[0] + (8 * i)); 920 } 921 922 /* row transform */ 923 for (i = 0; i < 4; ++i) { 924 fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0], 925 out + (8 * i * 32)); 926 } 927 } 928 929 void vpx_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) { 930 int sum, i; 931 v8i16 in0, in1, in2, in3, in4, in5, in6, in7; 932 v4i32 vec_w = { 0 }; 933 934 for (i = 0; i < 16; ++i) { 935 LD_SH4(input, 8, in0, in1, in2, in3); 936 input += stride; 937 LD_SH4(input, 8, in4, in5, in6, in7); 938 input += stride; 939 ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6); 940 ADD2(in0, in2, in4, in6, in0, in4); 941 vec_w += __msa_hadd_s_w(in0, in0); 942 vec_w += __msa_hadd_s_w(in4, in4); 943 } 944 945 sum = HADD_SW_S32(vec_w); 946 out[0] = (int16_t)(sum >> 3); 947 } 948