1 /****************************************************************************** 2 * 3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ******************************************************************************/ 18 /** 19 ******************************************************************************* 20 * @file 21 * ihevc_32x32_itrans_recon_x86_intr.c 22 * 23 * @brief 24 * Contains function definitions for inverse quantization, inverse 25 * transform and reconstruction 26 * 27 * @author 28 * 100470 29 * 30 * @par List of Functions: 31 * - ihevc_itrans_recon_32x32_sse42() 32 * 33 * @remarks 34 * None 35 * 36 ******************************************************************************* 37 */ 38 #include <stdio.h> 39 #include <string.h> 40 #include "ihevc_typedefs.h" 41 #include "ihevc_platform_macros.h" 42 #include "ihevc_macros.h" 43 #include "ihevc_defs.h" 44 #include "ihevc_trans_tables.h" 45 #include "ihevc_iquant_itrans_recon.h" 46 #include "ihevc_func_selector.h" 47 #include "ihevc_trans_macros.h" 48 49 #include <emmintrin.h> 50 #include <smmintrin.h> 51 #include <tmmintrin.h> 52 53 /** 54 ******************************************************************************* 55 * 56 * @brief 57 * This function performs inverse quantization, inverse transform and 58 * reconstruction for 32x32 input block 59 * 60 * @par Description: 61 * Performs inverse quantization, inverse transform 
and adds the 62 * prediction data and clips output to 8 bit 63 * 64 * @param[in] pi2_src 65 * Input 32x32 coefficients 66 * 67 * @param[in] pi2_tmp 68 * Temporary buffer for storing inverse 69 * transform 1st stage output 70 * 71 * @param[in] pu1_pred 72 * Prediction 32x32 block 73 * 74 * @param[out] pu1_dst 75 * Output 32x32 block 76 * 77 * @param[in] src_strd 78 * Input stride 79 * 80 * @param[in] pred_strd 81 * Prediction stride 82 * 83 * @param[in] dst_strd 84 * Output Stride 85 * 86 * @param[in] zero_cols 87 * Zero columns in pi2_src 88 * 89 * @param[in] zero_rows 90 * Zero rows in pi2_src 91 * 98 * @returns Void 99 * 100 * @remarks 101 * None 102 * 103 ******************************************************************************* 104 */ 105 /**/ 106 107 void ihevc_itrans_recon_32x32_sse42(WORD16 *pi2_src, 108 WORD16 *pi2_tmp, 109 UWORD8 *pu1_pred, 110 UWORD8 *pu1_dst, 111 WORD32 src_strd, 112 WORD32 pred_strd, 113 WORD32 dst_strd, 114 WORD32 zero_cols, 115 WORD32 zero_rows) 116 { 117 /* Inverse Transform */ 118 119 WORD32 j; 120 121 122 WORD16 *pi2_tmp_orig; 123 124 125 WORD16 *o_temp_ptr; 126 WORD16 *temp_ptr; 127 128 __m128i m_temp_reg_0; 129 __m128i m_temp_reg_1; 130 __m128i m_temp_reg_2; 131 __m128i m_temp_reg_3; 132 __m128i m_temp_reg_4; 133 __m128i m_temp_reg_5; 134 __m128i m_temp_reg_6; 135 __m128i m_temp_reg_7; 136 __m128i m_temp_reg_10; 137 __m128i m_temp_reg_11; 138 __m128i m_temp_reg_12; 139 __m128i m_temp_reg_13; 140 __m128i m_temp_reg_14; 141 __m128i m_temp_reg_15; 142 __m128i m_temp_reg_16; 143 __m128i m_temp_reg_17; 144 __m128i m_temp_reg_18; 145 __m128i m_temp_reg_19; 146 __m128i m_temp_reg_20; 147 __m128i m_temp_reg_21; 148 __m128i m_temp_reg_22; 149 __m128i m_temp_reg_23; 150 __m128i m_temp_reg_30; 151 __m128i m_temp_reg_31; 152 __m128i m_temp_reg_32; 153 __m128i m_temp_reg_33; 154 __m128i m_temp_reg_34; 155 
__m128i m_temp_reg_35; 156 __m128i m_temp_reg_36; 157 __m128i m_temp_reg_37; 158 __m128i m_temp_reg_40; 159 __m128i m_temp_reg_41; 160 __m128i m_temp_reg_42; 161 __m128i m_temp_reg_43; 162 __m128i m_temp_reg_44; 163 __m128i m_temp_reg_45; 164 __m128i m_temp_reg_46; 165 __m128i m_temp_reg_47; 166 167 __m128i m_temp_reg_70; 168 __m128i m_temp_reg_71; 169 __m128i m_temp_reg_72; 170 __m128i m_temp_reg_73; 171 __m128i m_temp_reg_74; 172 __m128i m_temp_reg_75; 173 __m128i m_temp_reg_76; 174 __m128i m_temp_reg_77; 175 176 __m128i m_temp_reg_80; 177 __m128i m_temp_reg_81; 178 __m128i m_temp_reg_82; 179 __m128i m_temp_reg_83; 180 __m128i m_temp_reg_84; 181 __m128i m_temp_reg_85; 182 __m128i m_temp_reg_86; 183 __m128i m_temp_reg_87; 184 185 __m128i m_temp_reg_90; 186 __m128i m_temp_reg_91; 187 __m128i m_temp_reg_92; 188 __m128i m_temp_reg_93; 189 __m128i m_temp_reg_94; 190 __m128i m_temp_reg_95; 191 __m128i m_temp_reg_96; 192 __m128i m_temp_reg_97; 193 194 __m128i m_rdng_factor; 195 __m128i m_count; 196 __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4; 197 __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8; 198 199 __m128i temp1, temp2, temp3, temp4; 200 __m128i temp5, temp6, temp7, temp8; 201 202 __m128i all_zero_reg; 203 WORD32 i; 204 205 /*Lokesh*/ 206 WORD32 zero_last24_cols_stg1; 207 WORD32 zero_last24_rows_stg1; 208 WORD32 zero_last28_rows_stg1; 209 210 WORD32 zero_last28_rows_stg2; 211 WORD32 zero_last24_rows_stg2; 212 213 WORD32 trans_size_stg1; 214 215 WORD32 i4_shift = IT_SHIFT_STAGE_1; 216 WORD32 trans_size = TRANS_SIZE_32; 217 218 219 /* Last 8 cols of 16x16 block are skipped based on the below flag : Lokesh */ 220 zero_last24_cols_stg1 = ((zero_cols & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0; 221 zero_last24_rows_stg1 = ((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0; 222 zero_last28_rows_stg1 = ((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0; 223 224 zero_last28_rows_stg2 = ((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0) ? 
1 : 0; 225 zero_last24_rows_stg2 = zero_last24_cols_stg1; 226 227 if((zero_last28_rows_stg2) || (zero_last24_cols_stg1)) 228 { 229 trans_size_stg1 = 8; 230 231 } 232 else 233 { 234 trans_size_stg1 = 32; 235 } 236 237 all_zero_reg = _mm_setzero_si128(); 238 239 o_temp_ptr = pi2_tmp; 240 temp_ptr = (pi2_tmp + 1024); 241 242 pi2_tmp += 2048; 243 pi2_tmp_orig = pi2_tmp; 244 245 for(i = 0; i < trans_size_stg1; i += 8) 246 { 247 248 { 249 WORD16 *pi2_tmp_src = pi2_src; 250 251 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 252 pi2_tmp_src += (src_strd << 1); 253 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 254 pi2_tmp_src += (src_strd << 1); 255 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 256 pi2_tmp_src += (src_strd << 1); 257 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 258 pi2_tmp_src += (src_strd << 1); 259 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 260 pi2_tmp_src += (src_strd << 1); 261 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 262 pi2_tmp_src += (src_strd << 1); 263 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 264 pi2_tmp_src += (src_strd << 1); 265 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 266 pi2_tmp_src += (src_strd << 1); 267 268 m_temp_reg_80 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 269 pi2_tmp_src += (src_strd << 1); 270 m_temp_reg_81 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 271 pi2_tmp_src += (src_strd << 1); 272 m_temp_reg_82 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 273 pi2_tmp_src += (src_strd << 1); 274 m_temp_reg_83 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 275 pi2_tmp_src += (src_strd << 1); 276 m_temp_reg_84 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 277 pi2_tmp_src += (src_strd << 1); 278 m_temp_reg_85 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 279 pi2_tmp_src += (src_strd << 1); 280 m_temp_reg_86 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 281 pi2_tmp_src += (src_strd << 1); 282 m_temp_reg_87 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 283 } 
284 285 if(zero_last28_rows_stg1) 286 { 287 /* eeo */ 288 /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 289 /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 290 { 291 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 292 293 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg); 294 295 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 296 297 /* eeeo[0]= m_temp_reg_20 */ 298 /* eeeo[1]= m_temp_reg_21 */ 299 /* eeee[0]= m_temp_reg_22 */ 300 /* eeee[1]= m_temp_reg_23 */ 301 302 /* eee[0] = eeee[0] + eeeo[0]; */ 303 m_temp_reg_40 = m_temp_reg_14; 304 305 /* eee[3] = eeee[0] - eeeo[0]; */ 306 m_temp_reg_43 = m_temp_reg_14; 307 308 /* eee[2] = eeee[1] - eeeo[1]; */ 309 m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16; 310 311 /* eee[1] = eeee[1] + eeeo[1];*/ 312 m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16; 313 314 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); 315 316 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg); 317 318 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 319 320 /* eeeo[0]= m_temp_reg_20 */ 321 /* eeeo[1]= m_temp_reg_21 */ 322 /* eeee[0]= m_temp_reg_22 */ 323 /* eeee[1]= m_temp_reg_23 */ 324 325 /* eee[0] = eeee[0] + eeeo[0]; */ 326 m_temp_reg_44 = m_temp_reg_14; 327 328 /* eee[3] = eeee[0] - eeeo[0]; */ 329 m_temp_reg_47 = m_temp_reg_14; 330 331 /* eee[2] = eeee[1] - eeeo[1]; */ 332 m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16; 333 334 /* eee[1] = eeee[1] + eeeo[1];*/ 335 m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16; 336 337 338 } 339 /* eo */ 340 { 341 WORD16 *pi2_scratch = o_temp_ptr; 342 343 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 344 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 345 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80 346 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70 347 m_coeff5 = 
_mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 348 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43 349 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 350 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9 351 352 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg); 353 354 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); 355 356 /* eo0[0-3] */ 357 { 358 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 359 360 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg); 361 362 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 363 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 364 365 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 366 pi2_scratch += 8; 367 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 368 pi2_scratch += 8; 369 370 } 371 372 /* eo0[4-7] */ 373 { 374 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 375 376 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 377 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 378 379 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 380 pi2_scratch += 8; 381 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 382 pi2_scratch += 8; 383 384 } 385 /* eo1[0-3] */ 386 { 387 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2); 388 389 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30); 390 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30); 391 392 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 393 pi2_scratch += 8; 394 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 395 pi2_scratch += 8; 396 397 } 398 399 /* eo1[4-7] */ 400 { 401 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2); 402 403 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30); 404 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30); 405 406 
_mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 407 pi2_scratch += 8; 408 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 409 pi2_scratch += 8; 410 411 } 412 413 /* eo2[0-3] */ 414 { 415 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 416 417 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 418 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 419 420 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 421 pi2_scratch += 8; 422 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 423 pi2_scratch += 8; 424 425 } 426 427 /* eo2[4-7] */ 428 { 429 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 430 431 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 432 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 433 434 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 435 pi2_scratch += 8; 436 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 437 pi2_scratch += 8; 438 439 } 440 441 /**************************************************************************/ 442 443 444 /* eo3[0-3] */ 445 { 446 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4); 447 448 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30); 449 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30); 450 451 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 452 pi2_scratch += 8; 453 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 454 pi2_scratch += 8; 455 456 } 457 458 /* eo3[4-7] */ 459 { 460 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4); 461 462 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30); 463 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30); 464 465 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 466 pi2_scratch += 8; 467 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 468 pi2_scratch += 8; 469 470 } 471 472 473 /* eo4[0-3] */ 474 { 475 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 476 477 m_temp_reg_34 = 
_mm_add_epi32(m_temp_reg_43, m_temp_reg_30); 478 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30); 479 480 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 481 pi2_scratch += 8; 482 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 483 pi2_scratch += 8; 484 485 } 486 /* eo4[4-7] */ 487 { 488 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 489 490 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30); 491 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30); 492 493 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 494 pi2_scratch += 8; 495 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 496 pi2_scratch += 8; 497 498 } 499 500 /***********************************************************************/ 501 502 /* eo5[0-3] */ 503 { 504 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff6); 505 506 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 507 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 508 509 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 510 pi2_scratch += 8; 511 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 512 pi2_scratch += 8; 513 514 } 515 516 517 /* eo5[4-7] */ 518 { 519 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff6); 520 521 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 522 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 523 524 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 525 pi2_scratch += 8; 526 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 527 pi2_scratch += 8; 528 529 } 530 531 /* eo6[0-3] */ 532 { 533 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff7); 534 535 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30); 536 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30); 537 538 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 539 pi2_scratch += 8; 540 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 541 pi2_scratch += 8; 542 543 } 544 545 546 
/* eo6[4-7] */ 547 { 548 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff7); 549 550 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30); 551 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30); 552 553 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 554 pi2_scratch += 8; 555 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 556 pi2_scratch += 8; 557 558 } 559 560 561 /* eo7[0-3] */ 562 { 563 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff8); 564 565 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 566 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 567 568 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 569 pi2_scratch += 8; 570 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 571 pi2_scratch += 8; 572 573 } 574 575 576 /* eo7[4-7] */ 577 { 578 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff8); 579 580 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 581 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 582 583 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 584 pi2_scratch += 8; 585 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 586 pi2_scratch += 8; 587 588 } 589 590 } 591 } 592 else if(zero_last24_rows_stg1) 593 { 594 { 595 /* eeo */ 596 /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 597 /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 598 599 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36 600 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83 601 602 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64 603 604 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg); 605 606 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 607 608 /* eeeo[0]= m_temp_reg_20 */ 609 /* eeeo[1]= m_temp_reg_21 */ 610 /* eeee[0]= m_temp_reg_22 */ 611 /* eeee[1]= m_temp_reg_23 */ 612 613 /* eee[0] = eeee[0] + 
eeeo[0]; */ 614 m_temp_reg_40 = m_temp_reg_14; 615 616 /* eee[3] = eeee[0] - eeeo[0]; */ 617 m_temp_reg_43 = m_temp_reg_14; 618 619 /* eee[2] = eeee[1] - eeeo[1]; */ 620 m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16; 621 622 /* eee[1] = eeee[1] + eeeo[1];*/ 623 m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16; 624 625 /* for row 4 to 7 */ 626 627 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); 628 629 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg); 630 631 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 632 633 /* eeeo[0]= m_temp_reg_20 */ 634 /* eeeo[1]= m_temp_reg_21 */ 635 /* eeee[0]= m_temp_reg_22 */ 636 /* eeee[1]= m_temp_reg_23 */ 637 638 /* eee[0] = eeee[0] + eeeo[0]; */ 639 m_temp_reg_44 = m_temp_reg_14; 640 641 /* eee[3] = eeee[0] - eeeo[0]; */ 642 m_temp_reg_47 = m_temp_reg_14; 643 644 /* eee[2] = eeee[1] - eeeo[1]; */ 645 m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16; 646 647 /* eee[1] = eeee[1] + eeeo[1];*/ 648 m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16; 649 650 651 // eeo[] 652 /* for(k = 0; k < 4; k++) */ 653 654 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75 655 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 656 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 657 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18 658 659 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg); 660 661 m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8); 662 663 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg); 664 665 m_temp_reg_33 = _mm_setzero_si128(); 666 667 /* eeo */ 668 { 669 /* eeo0[0-3] */ 670 { 671 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 672 673 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 674 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 675 676 m_temp_reg_90 = m_temp_reg_34; 677 m_temp_reg_97 = m_temp_reg_35; 
678 } 679 /* eeo0[4-7] */ 680 { 681 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 682 683 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 684 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 685 686 m_temp_reg_91 = m_temp_reg_34; 687 m_temp_reg_96 = m_temp_reg_35; 688 689 } 690 691 /* eeo1[0-3] */ 692 { 693 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2); 694 695 /* e[1][0-3] stored in pi2_tmp[2][0-7] */ 696 /* e[6][0-3] stored in pi2_tmp[2][8-15] */ 697 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30); 698 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30); 699 700 m_temp_reg_92 = m_temp_reg_34; 701 m_temp_reg_95 = m_temp_reg_35; 702 703 } 704 705 /* eo1[4-7] */ 706 { 707 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2); 708 709 /* e[1][4-7] stored in pi2_tmp[3][0-7] */ 710 /* e[6][4-7] stored in pi2_tmp[3][8-15] */ 711 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30); 712 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30); 713 714 m_temp_reg_93 = m_temp_reg_34; 715 m_temp_reg_94 = m_temp_reg_35; 716 717 718 } 719 720 /* eo2[0-3] */ 721 { 722 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4); 723 724 /* e[2][0-3] stored in pi2_tmp[4][0-7] */ 725 /* e[5][0-3] stored in pi2_tmp[4][8-15] */ 726 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 727 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 728 729 temp1 = m_temp_reg_34; 730 temp7 = m_temp_reg_35; 731 732 } 733 734 /* eo2[4-7] */ 735 { 736 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4); 737 738 /* e[2][4-7] stored in pi2_tmp[5][0-7] */ 739 /* e[5][4-7] stored in pi2_tmp[5][8-15] */ 740 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 741 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 742 743 temp2 = m_temp_reg_34; 744 temp6 = m_temp_reg_35; 745 746 } 747 748 /* eo3[0-3] */ 749 { 750 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 751 752 /* e[3][0-3] 
stored in pi2_tmp[6][0-7] */ 753 /* e[4][0-3] stored in pi2_tmp[6][8-15] */ 754 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30); 755 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30); 756 757 temp3 = m_temp_reg_34; 758 temp5 = m_temp_reg_35; 759 760 } 761 762 763 /* eo3[4-7] */ 764 { 765 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 766 767 /* e[3][4-7] stored in pi2_tmp[7][0-7] */ 768 /* e[4][4-7] stored in pi2_tmp[7][8-15] */ 769 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30); 770 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30); 771 772 temp4 = m_temp_reg_34; 773 temp8 = m_temp_reg_35; 774 775 776 } 777 /* All values of ee[] array in pi2_temp */ 778 779 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87 780 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70 781 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43 782 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9 783 784 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 785 786 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); 787 m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8); 788 789 } 790 } 791 /* eo */ 792 { 793 794 WORD16 *pi2_scratch = o_temp_ptr; 795 796 /* eo0[0-3] */ 797 { 798 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 799 800 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30); 801 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30); 802 803 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 804 pi2_scratch += 8; 805 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 806 pi2_scratch += 8; 807 808 } 809 810 811 /* eo0[4-7] */ 812 { 813 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 814 815 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 816 817 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30); 818 
m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30); 819 820 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 821 pi2_scratch += 8; 822 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 823 pi2_scratch += 8; 824 825 } 826 827 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57 828 829 /* eo1[0-3] */ 830 { 831 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 832 833 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30); 834 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30); 835 836 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 837 pi2_scratch += 8; 838 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 839 pi2_scratch += 8; 840 841 } 842 843 844 /* eo1[4-7] */ 845 { 846 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 847 848 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30); 849 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30); 850 851 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 852 pi2_scratch += 8; 853 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 854 pi2_scratch += 8; 855 856 } 857 858 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9 859 860 /* eo2[0-3] */ 861 { 862 863 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 864 865 m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30); 866 m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30); 867 868 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 869 pi2_scratch += 8; 870 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 871 pi2_scratch += 8; 872 873 } 874 875 /* eo2[4-7] */ 876 { 877 878 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 879 880 m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30); 881 m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30); 882 883 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 884 pi2_scratch += 8; 885 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 886 
pi2_scratch += 8; 887 888 } 889 890 /**************************************************************************/ 891 892 893 894 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43 895 896 /* eo3[0-3] */ 897 { 898 899 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 900 901 m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30); 902 m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30); 903 904 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 905 pi2_scratch += 8; 906 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 907 pi2_scratch += 8; 908 909 } 910 911 912 /* eo3[4-7] */ 913 { 914 915 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 916 917 m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30); 918 m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30); 919 920 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 921 pi2_scratch += 8; 922 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 923 pi2_scratch += 8; 924 925 } 926 927 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80 928 929 /* eo4[0-3] */ 930 { 931 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 932 933 m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30); 934 m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30); 935 936 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 937 pi2_scratch += 8; 938 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 939 pi2_scratch += 8; 940 941 } 942 /* eo4[4-7] */ 943 { 944 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 945 946 m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30); 947 m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30); 948 949 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 950 pi2_scratch += 8; 951 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 952 pi2_scratch += 8; 953 954 } 955 956 /***********************************************************************/ 957 958 m_coeff1 = 
_mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90 959 960 /* eo5[0-3] */ 961 { 962 963 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 964 965 m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30); 966 m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30); 967 968 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 969 pi2_scratch += 8; 970 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 971 pi2_scratch += 8; 972 973 } 974 975 976 /* eo5[4-7] */ 977 { 978 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 979 980 m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30); 981 m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30); 982 983 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 984 pi2_scratch += 8; 985 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 986 pi2_scratch += 8; 987 988 } 989 990 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70 991 992 /* eo6[0-3] */ 993 { 994 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 995 996 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30); 997 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30); 998 999 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1000 pi2_scratch += 8; 1001 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1002 pi2_scratch += 8; 1003 1004 } 1005 1006 1007 /* eo6[4-7] */ 1008 { 1009 1010 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1011 1012 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30); 1013 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30); 1014 1015 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1016 pi2_scratch += 8; 1017 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1018 pi2_scratch += 8; 1019 1020 } 1021 1022 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25 1023 1024 /* eo7[0-3] */ 1025 { 1026 1027 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 
1028 1029 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30); 1030 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30); 1031 1032 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1033 pi2_scratch += 8; 1034 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1035 pi2_scratch += 8; 1036 1037 } 1038 1039 1040 /* eo7[4-7] */ 1041 { 1042 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1043 1044 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30); 1045 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30); 1046 1047 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1048 pi2_scratch += 8; 1049 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1050 pi2_scratch += 8; 1051 1052 } 1053 1054 } 1055 1056 } 1057 else 1058 { 1059 1060 { 1061 /* eeo */ 1062 /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 1063 /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 1064 1065 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36 1066 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83 1067 1068 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64 1069 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64 1070 1071 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84); 1072 1073 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80); 1074 1075 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */ 1076 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */ 1077 1078 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */ 1079 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */ 1080 1081 1082 /* eeeo[0]= m_temp_reg_20 */ 1083 /* eeeo[1]= m_temp_reg_21 */ 1084 /* eeee[0]= m_temp_reg_22 */ 1085 /* eeee[1]= m_temp_reg_23 */ 1086 1087 /* eee[0] = eeee[0] + eeeo[0]; */ 1088 m_temp_reg_40 = 
_mm_add_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[0] */ 1089 1090 /* eee[3] = eeee[0] - eeeo[0]; */ 1091 m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[1] */ 1092 1093 /* eee[2] = eeee[1] - eeeo[1]; */ 1094 m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[1] */ 1095 1096 /* eee[1] = eeee[1] + eeeo[1];*/ 1097 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[0] */ 1098 1099 /* for row 4 to 7 */ 1100 1101 m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8); 1102 m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8); 1103 1104 /* Interleaving row 8 and row 24*/ 1105 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84); 1106 1107 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); 1108 m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8); 1109 1110 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80); 1111 1112 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */ 1113 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */ 1114 1115 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */ 1116 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */ 1117 1118 1119 /* eeeo[0]= m_temp_reg_20 */ 1120 /* eeeo[1]= m_temp_reg_21 */ 1121 /* eeee[0]= m_temp_reg_22 */ 1122 /* eeee[1]= m_temp_reg_23 */ 1123 1124 /* eee[0] = eeee[0] + eeeo[0]; */ 1125 m_temp_reg_44 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[0] */ 1126 1127 /* eee[3] = eeee[0] - eeeo[0]; */ 1128 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[1] */ 1129 1130 /* eee[2] = eeee[1] - eeeo[1]; */ 1131 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[1] */ 1132 1133 /* eee[1] = eeee[1] + eeeo[1];*/ 1134 m_temp_reg_45 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[0] */ 1135 1136 1137 // eeo[] 1138 /* for(k = 0; k < 4; k++) */ 1139 1140 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75 1141 
m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18 1142 1143 /* eeo */ 1144 { 1145 /* eeo0[0-3] */ 1146 { 1147 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 1148 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86); 1149 1150 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1151 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1152 1153 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1154 1155 m_temp_reg_90 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 1156 m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 1157 1158 } 1159 1160 m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8); 1161 m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8); 1162 m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8); 1163 m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8); 1164 1165 /* eeo0[4-7] */ 1166 { 1167 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 1168 m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86); 1169 1170 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1171 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 1172 1173 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1174 1175 m_temp_reg_91 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 1176 m_temp_reg_96 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 1177 1178 } 1179 1180 1181 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18 1182 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89 50 1183 1184 /* eeo1[0-3] */ 1185 { 1186 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 1187 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4); 1188 1189 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30); 1190 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30); 1191 1192 m_temp_reg_92 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31); 1193 m_temp_reg_95 = _mm_add_epi32(m_temp_reg_35, 
m_temp_reg_31); 1194 1195 } 1196 1197 /* eeo1[4-7] */ 1198 { 1199 1200 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 1201 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4); 1202 1203 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30); 1204 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30); 1205 1206 m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31); 1207 m_temp_reg_94 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31); 1208 1209 1210 } 1211 1212 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89 1213 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 75 1214 1215 /* eeo2[0-3] */ 1216 { 1217 1218 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 1219 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4); 1220 1221 /* e[2][0-3] stored in pi2_tmp[4][0-7] */ 1222 /* e[5][0-3] stored in pi2_tmp[4][8-15] */ 1223 1224 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 1225 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 1226 1227 temp1 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31); 1228 temp7 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31); 1229 1230 } 1231 1232 /* eeo2[4-7] */ 1233 { 1234 1235 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 1236 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4); 1237 1238 /* e[2][4-7] stored in pi2_tmp[5][0-7] */ 1239 /* e[5][4-7] stored in pi2_tmp[5][8-15] */ 1240 1241 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 1242 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 1243 1244 temp2 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31); 1245 temp6 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31); 1246 1247 } 1248 1249 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50 1250 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75 -89 1251 1252 /* eeo3[0-3] */ 1253 { 1254 1255 m_temp_reg_30 = 
_mm_madd_epi16(m_temp_reg_10, m_coeff3); 1256 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4); 1257 1258 /* e[3][0-3] stored in pi2_tmp[6][0-7] */ 1259 /* e[4][0-3] stored in pi2_tmp[6][8-15] */ 1260 1261 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30); 1262 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30); 1263 1264 temp3 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31); 1265 temp5 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31); 1266 1267 1268 } 1269 1270 /* eeo3[4-7] */ 1271 { 1272 1273 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 1274 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4); 1275 1276 /* e[3][4-7] stored in pi2_tmp[7][0-7] */ 1277 /* e[4][4-7] stored in pi2_tmp[7][8-15] */ 1278 1279 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30); 1280 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30); 1281 temp4 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31); 1282 temp8 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31); 1283 1284 } 1285 1286 1287 /* All values of ee[] array in pi2_temp */ 1288 1289 /* for(k = 0; k < 8; k++) */ 1290 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87 1291 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70 1292 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43 1293 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9 1294 } 1295 } 1296 /* eo */ 1297 { 1298 1299 WORD16 *pi2_scratch = o_temp_ptr; 1300 1301 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 1302 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); 1303 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83); 1304 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87); 1305 1306 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); 1307 m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8); 1308 m_temp_reg_75 = 
_mm_srli_si128(m_temp_reg_75, 8); 1309 m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8); 1310 1311 m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8); 1312 m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8); 1313 m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8); 1314 m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8); 1315 1316 /* eo0[0-3] */ 1317 { 1318 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1319 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1320 1321 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1322 1323 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1324 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1325 1326 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1327 1328 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1329 1330 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30); 1331 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30); 1332 1333 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1334 pi2_scratch += 8; 1335 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1336 pi2_scratch += 8; 1337 1338 } 1339 /* eo0[4-7] */ 1340 { 1341 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 1342 m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); 1343 m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83); 1344 m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87); 1345 1346 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1347 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 1348 1349 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1350 1351 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 1352 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); 1353 1354 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1355 1356 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1357 1358 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30); 1359 
m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30); 1360 1361 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1362 pi2_scratch += 8; 1363 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1364 pi2_scratch += 8; 1365 1366 } 1367 1368 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57 1369 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0 -43 1370 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80 90 1371 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70 25 1372 1373 /* eo1[0-3] */ 1374 { 1375 1376 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1377 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1378 1379 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1380 1381 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1382 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1383 1384 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1385 1386 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32); 1387 1388 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30); 1389 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30); 1390 1391 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1392 pi2_scratch += 8; 1393 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1394 pi2_scratch += 8; 1395 1396 } 1397 1398 /* eo1[4-7] */ 1399 { 1400 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1401 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 1402 1403 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1404 1405 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 1406 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); 1407 1408 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1409 1410 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32); 1411 1412 m_temp_reg_34 = 
_mm_add_epi32(m_temp_reg_93, m_temp_reg_30); 1413 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30); 1414 1415 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1416 pi2_scratch += 8; 1417 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1418 pi2_scratch += 8; 1419 1420 } 1421 1422 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9 1423 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70 87 1424 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25 57 1425 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90 43 1426 1427 /* eo2[0-3] */ 1428 { 1429 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1430 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1431 1432 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31); 1433 1434 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1435 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1436 1437 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1438 1439 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1440 1441 m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30); 1442 m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30); 1443 1444 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1445 pi2_scratch += 8; 1446 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1447 pi2_scratch += 8; 1448 1449 } 1450 1451 1452 /* eo2[4-7] */ 1453 { 1454 1455 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1456 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 1457 1458 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31); 1459 1460 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 1461 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); 1462 1463 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1464 1465 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1466 
1467 m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30); 1468 m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30); 1469 1470 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1471 pi2_scratch += 8; 1472 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1473 pi2_scratch += 8; 1474 1475 } 1476 /**************************************************************************/ 1477 1478 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43 1479 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87 9 1480 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90 25 1481 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80 57 1482 1483 /* eo3[0-3] */ 1484 { 1485 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1486 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1487 1488 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1489 1490 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1491 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1492 1493 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33); 1494 1495 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1496 1497 m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30); 1498 m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30); 1499 1500 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1501 pi2_scratch += 8; 1502 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1503 pi2_scratch += 8; 1504 1505 } 1506 1507 1508 /* eo3[4-7] */ 1509 { 1510 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1511 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 1512 1513 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1514 1515 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 1516 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); 1517 1518 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, 
m_temp_reg_33); 1519 1520 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1521 1522 m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30); 1523 m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30); 1524 1525 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1526 pi2_scratch += 8; 1527 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1528 pi2_scratch += 8; 1529 1530 } 1531 1532 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80 1533 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25 90 1534 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9 87 1535 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43 70 1536 1537 /* eo4[0-3] */ 1538 { 1539 1540 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1541 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1542 1543 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1544 1545 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1546 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1547 1548 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32); 1549 1550 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1551 1552 m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30); 1553 m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30); 1554 1555 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1556 pi2_scratch += 8; 1557 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1558 pi2_scratch += 8; 1559 1560 } 1561 1562 1563 /* eo4[4-7] */ 1564 { 1565 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1566 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 1567 1568 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1569 1570 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 1571 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); 1572 1573 m_temp_reg_32 = 
_mm_sub_epi32(m_temp_reg_33, m_temp_reg_32); 1574 1575 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1576 1577 m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30); 1578 m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30); 1579 1580 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1581 pi2_scratch += 8; 1582 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1583 pi2_scratch += 8; 1584 1585 } 1586 1587 /***********************************************************************/ 1588 1589 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90 1590 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57 25 1591 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87 70 1592 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9 -80 1593 1594 /* eo5[0-3] */ 1595 { 1596 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1597 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1598 1599 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1600 1601 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1602 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1603 1604 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1605 1606 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1607 1608 m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30); 1609 m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30); 1610 1611 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1612 pi2_scratch += 8; 1613 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1614 pi2_scratch += 8; 1615 1616 } 1617 1618 1619 /* eo5[4-7] */ 1620 { 1621 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1622 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 1623 1624 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1625 1626 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 
1627 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); 1628 1629 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1630 1631 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1632 1633 m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30); 1634 m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30); 1635 1636 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1637 pi2_scratch += 8; 1638 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1639 pi2_scratch += 8; 1640 1641 } 1642 1643 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70 1644 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90 -80 1645 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43 9 1646 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57 87 1647 1648 /* eo6[0-3] */ 1649 { 1650 1651 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1652 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1653 1654 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1655 1656 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1657 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1658 1659 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1660 1661 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1662 1663 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30); 1664 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30); 1665 1666 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1667 pi2_scratch += 8; 1668 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1669 pi2_scratch += 8; 1670 1671 } 1672 1673 1674 /* eo6[4-7] */ 1675 { 1676 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1677 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 1678 1679 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1680 1681 m_temp_reg_32 = 
_mm_madd_epi16(m_temp_reg_16, m_coeff3); 1682 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); 1683 1684 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1685 1686 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1687 1688 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30); 1689 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30); 1690 1691 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1692 pi2_scratch += 8; 1693 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1694 pi2_scratch += 8; 1695 1696 } 1697 1698 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25 1699 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43 -57 1700 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70 -80 1701 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87 -90 1702 1703 /* eo7[0-3] */ 1704 { 1705 1706 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1707 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1708 1709 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1710 1711 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1712 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1713 1714 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1715 1716 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1717 1718 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30); 1719 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30); 1720 1721 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1722 pi2_scratch += 8; 1723 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1724 pi2_scratch += 8; 1725 1726 } 1727 1728 1729 /* eo7[4-7] */ 1730 { 1731 1732 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1733 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 1734 1735 m_temp_reg_30 = 
_mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1736 1737 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 1738 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); 1739 1740 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1741 1742 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1743 1744 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30); 1745 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30); 1746 1747 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1748 pi2_scratch += 8; 1749 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1750 pi2_scratch += 8; 1751 1752 } 1753 1754 } 1755 1756 } 1757 /* All e[] are done */ 1758 /****************************/ 1759 1760 { 1761 1762 WORD16 *pi2_tmp_src = pi2_src + src_strd; 1763 1764 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 1765 pi2_tmp_src += (src_strd << 1); 1766 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 1767 pi2_tmp_src += (src_strd << 1); 1768 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 1769 pi2_tmp_src += (src_strd << 1); 1770 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 1771 pi2_tmp_src += (src_strd << 1); 1772 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 1773 pi2_tmp_src += (src_strd << 1); 1774 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 1775 pi2_tmp_src += (src_strd << 1); 1776 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 1777 pi2_tmp_src += (src_strd << 1); 1778 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 1779 pi2_tmp_src += (src_strd << 1); 1780 1781 m_temp_reg_80 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 1782 pi2_tmp_src += (src_strd << 1); 1783 m_temp_reg_81 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 1784 pi2_tmp_src += (src_strd << 1); 1785 m_temp_reg_82 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 1786 pi2_tmp_src += (src_strd << 1); 1787 m_temp_reg_83 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 1788 pi2_tmp_src += (src_strd << 1); 1789 
m_temp_reg_84 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 1790 pi2_tmp_src += (src_strd << 1); 1791 m_temp_reg_85 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 1792 pi2_tmp_src += (src_strd << 1); 1793 m_temp_reg_86 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 1794 pi2_tmp_src += (src_strd << 1); 1795 m_temp_reg_87 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 1796 } 1797 1798 if(zero_last28_rows_stg1) 1799 { 1800 /* o & stage 1 out */ 1801 { 1802 WORD32 j; 1803 WORD16 *pi2_src_scratch = o_temp_ptr; 1804 WORD16 *pi2_dst_scratch = temp_ptr; 1805 WORD32 out_stride = (trans_size << 1); 1806 WORD32 in_stride = trans_size; 1807 1808 for(j = 0; j < 2; j++) 1809 { 1810 if(j) 1811 { 1812 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); 1813 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); 1814 } 1815 1816 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); 1817 1818 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); 1819 1820 /* o0[0-3] */ 1821 { 1822 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1823 1824 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1825 pi2_src_scratch += in_stride; 1826 1827 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1828 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1829 1830 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1831 m_count = _mm_cvtsi32_si128(i4_shift); 1832 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 1833 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 1834 1835 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1836 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1837 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1838 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1839 1840 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1841 1842 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1843 pi2_dst_scratch += out_stride; 1844 1845 } 
1846 1847 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); 1848 1849 /* o1[0-3] */ 1850 { 1851 1852 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1853 1854 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1855 pi2_src_scratch += in_stride; 1856 1857 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1858 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1859 1860 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1861 m_count = _mm_cvtsi32_si128(i4_shift); 1862 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 1863 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 1864 1865 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1866 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1867 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1868 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1869 1870 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1871 1872 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1873 pi2_dst_scratch += out_stride; 1874 1875 } 1876 1877 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); 1878 1879 /* o2[0-3] */ 1880 { 1881 1882 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1883 1884 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1885 pi2_src_scratch += in_stride; 1886 1887 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1888 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1889 1890 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1891 m_count = _mm_cvtsi32_si128(i4_shift); 1892 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 1893 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 1894 1895 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1896 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1897 m_temp_reg_31 = 
_mm_sra_epi32(m_temp_reg_31, m_count); 1898 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1899 1900 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1901 1902 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1903 pi2_dst_scratch += out_stride; 1904 1905 } 1906 1907 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); 1908 1909 /* o3[0-3] */ 1910 { 1911 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1912 1913 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1914 pi2_src_scratch += in_stride; 1915 1916 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1917 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1918 1919 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1920 m_count = _mm_cvtsi32_si128(i4_shift); 1921 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 1922 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 1923 1924 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1925 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1926 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1927 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1928 1929 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1930 1931 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1932 pi2_dst_scratch += out_stride; 1933 1934 } 1935 1936 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); 1937 1938 /* o4[0-3] */ 1939 { 1940 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1941 1942 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1943 pi2_src_scratch += in_stride; 1944 1945 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1946 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1947 1948 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1949 m_count = _mm_cvtsi32_si128(i4_shift); 1950 m_rdng_factor = 
_mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 1951 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 1952 1953 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1954 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1955 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1956 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1957 1958 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1959 1960 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1961 pi2_dst_scratch += out_stride; 1962 1963 } 1964 1965 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); 1966 1967 /* o5[0-3] */ 1968 { 1969 1970 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1971 1972 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1973 pi2_src_scratch += in_stride; 1974 1975 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1976 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1977 1978 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1979 m_count = _mm_cvtsi32_si128(i4_shift); 1980 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 1981 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 1982 1983 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1984 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1985 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1986 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1987 1988 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1989 1990 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1991 pi2_dst_scratch += out_stride; 1992 1993 } 1994 1995 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); 1996 1997 /* o6[0-3] */ 1998 { 1999 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2000 2001 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2002 pi2_src_scratch += 
in_stride; 2003 2004 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2005 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2006 2007 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2008 m_count = _mm_cvtsi32_si128(i4_shift); 2009 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2010 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2011 2012 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2013 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2014 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2015 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2016 2017 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2018 2019 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2020 pi2_dst_scratch += out_stride; 2021 2022 } 2023 2024 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]); 2025 2026 /* o7[0-3] */ 2027 { 2028 2029 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2030 2031 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2032 pi2_src_scratch += 8; 2033 2034 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2035 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2036 2037 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2038 m_count = _mm_cvtsi32_si128(i4_shift); 2039 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2040 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2041 2042 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2043 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2044 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2045 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2046 2047 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2048 2049 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2050 pi2_dst_scratch += 8; 2051 2052 } 2053 2054 m_coeff1 = 
_mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]); 2055 2056 /* o8[0-3] */ 2057 { 2058 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2059 2060 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2061 pi2_src_scratch -= in_stride; 2062 2063 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2064 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2065 2066 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2067 m_count = _mm_cvtsi32_si128(i4_shift); 2068 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2069 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2070 2071 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2072 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2073 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2074 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2075 2076 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2077 2078 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2079 pi2_dst_scratch -= out_stride; 2080 } 2081 2082 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]); 2083 2084 /* o9[0-3] */ 2085 { 2086 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2087 2088 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2089 pi2_src_scratch -= in_stride; 2090 2091 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2092 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2093 2094 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2095 m_count = _mm_cvtsi32_si128(i4_shift); 2096 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2097 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2098 2099 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2100 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2101 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2102 
m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2103 2104 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2105 2106 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2107 pi2_dst_scratch -= out_stride; 2108 } 2109 2110 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]); 2111 2112 /* o10[0-3] */ 2113 { 2114 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2115 2116 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2117 pi2_src_scratch -= in_stride; 2118 2119 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2120 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2121 2122 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2123 m_count = _mm_cvtsi32_si128(i4_shift); 2124 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2125 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2126 2127 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2128 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2129 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2130 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2131 2132 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2133 2134 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2135 pi2_dst_scratch -= out_stride; 2136 } 2137 2138 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]); 2139 2140 /* o11[0-3] */ 2141 { 2142 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2143 2144 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2145 pi2_src_scratch -= in_stride; 2146 2147 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2148 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2149 2150 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2151 m_count = _mm_cvtsi32_si128(i4_shift); 2152 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2153 
m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2154 2155 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2156 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2157 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2158 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2159 2160 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2161 2162 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2163 pi2_dst_scratch -= out_stride; 2164 2165 } 2166 2167 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]); 2168 2169 /* o12[0-3] */ 2170 { 2171 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2172 2173 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2174 pi2_src_scratch -= in_stride; 2175 2176 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2177 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2178 2179 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2180 m_count = _mm_cvtsi32_si128(i4_shift); 2181 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2182 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2183 2184 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2185 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2186 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2187 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2188 2189 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2190 2191 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2192 pi2_dst_scratch -= out_stride; 2193 2194 } 2195 2196 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]); 2197 2198 /* o13[0-3] */ 2199 { 2200 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2201 2202 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2203 pi2_src_scratch -= in_stride; 2204 2205 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, 
m_temp_reg_20); 2206 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2207 2208 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2209 m_count = _mm_cvtsi32_si128(i4_shift); 2210 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2211 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2212 2213 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2214 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2215 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2216 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2217 2218 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2219 2220 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2221 pi2_dst_scratch -= out_stride; 2222 } 2223 2224 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]); 2225 2226 /* o14[0-3] */ 2227 { 2228 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2229 2230 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2231 pi2_src_scratch -= in_stride; 2232 2233 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2234 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2235 2236 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2237 m_count = _mm_cvtsi32_si128(i4_shift); 2238 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2239 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2240 2241 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2242 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2243 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2244 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2245 2246 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2247 2248 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2249 pi2_dst_scratch -= out_stride; 2250 2251 } 2252 2253 m_coeff1 = _mm_loadu_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]); 2254 2255 /* o15[0-3] */ 2256 { 2257 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2258 2259 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2260 pi2_src_scratch += 8; 2261 2262 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2263 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2264 2265 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2266 m_count = _mm_cvtsi32_si128(i4_shift); 2267 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2268 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2269 2270 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2271 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2272 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2273 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2274 2275 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2276 2277 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2278 pi2_dst_scratch += 8; 2279 } 2280 2281 } 2282 } 2283 } 2284 else if(zero_last24_rows_stg1) 2285 { 2286 /* o & stage 1 out */ 2287 { 2288 WORD32 j; 2289 2290 WORD16 *pi2_src_scratch = o_temp_ptr; 2291 WORD16 *pi2_dst_scratch = temp_ptr; 2292 WORD32 out_stride = (trans_size << 1); 2293 2294 WORD32 in_stride = trans_size; 2295 2296 for(j = 0; j < 2; j++) 2297 { 2298 if(j) 2299 { 2300 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); 2301 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); 2302 m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8); 2303 m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8); 2304 } 2305 2306 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved 2307 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved 2308 2309 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); 2310 m_coeff2 = _mm_loadu_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]); 2311 2312 /* o0[0-3] */ 2313 { 2314 2315 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2316 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2317 2318 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2319 2320 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2321 pi2_src_scratch += in_stride; 2322 2323 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2324 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2325 2326 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2327 m_count = _mm_cvtsi32_si128(i4_shift); 2328 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2329 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2330 2331 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2332 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2333 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2334 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2335 2336 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2337 2338 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2339 pi2_dst_scratch += out_stride; 2340 2341 } 2342 2343 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); 2344 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]); 2345 2346 /* o1[0-3] */ 2347 { 2348 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2349 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2350 2351 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2352 2353 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2354 pi2_src_scratch += in_stride; 2355 2356 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2357 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2358 2359 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2360 m_count = _mm_cvtsi32_si128(i4_shift); 2361 
m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2362 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2363 2364 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2365 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2366 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2367 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2368 2369 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2370 2371 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2372 pi2_dst_scratch += out_stride; 2373 2374 } 2375 2376 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); 2377 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]); 2378 2379 /* o2[0-3] */ 2380 { 2381 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2382 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2383 2384 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 2385 2386 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2387 pi2_src_scratch += in_stride; 2388 2389 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2390 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2391 2392 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2393 m_count = _mm_cvtsi32_si128(i4_shift); 2394 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2395 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2396 2397 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2398 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2399 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2400 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2401 2402 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2403 2404 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2405 pi2_dst_scratch += out_stride; 2406 2407 } 2408 2409 m_coeff1 = _mm_loadu_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); 2410 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]); 2411 2412 /* o3[0-3] */ 2413 { 2414 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2415 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2416 2417 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 2418 2419 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2420 pi2_src_scratch += in_stride; 2421 2422 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2423 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2424 2425 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2426 m_count = _mm_cvtsi32_si128(i4_shift); 2427 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2428 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2429 2430 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2431 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2432 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2433 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2434 2435 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2436 2437 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2438 pi2_dst_scratch += out_stride; 2439 2440 } 2441 2442 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); 2443 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]); 2444 2445 /* o4[0-3] */ 2446 { 2447 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2448 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2449 2450 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2451 2452 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2453 pi2_src_scratch += in_stride; 2454 2455 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2456 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2457 2458 m_rdng_factor = 
_mm_cvtsi32_si128((1 << (i4_shift - 1))); 2459 m_count = _mm_cvtsi32_si128(i4_shift); 2460 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2461 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2462 2463 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2464 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2465 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2466 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2467 2468 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2469 2470 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2471 pi2_dst_scratch += out_stride; 2472 2473 } 2474 2475 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); 2476 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]); 2477 2478 /* o5[0-3] */ 2479 { 2480 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2481 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2482 2483 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2484 2485 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2486 pi2_src_scratch += in_stride; 2487 2488 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2489 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2490 2491 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2492 m_count = _mm_cvtsi32_si128(i4_shift); 2493 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2494 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2495 2496 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2497 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2498 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2499 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2500 2501 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2502 2503 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2504 
pi2_dst_scratch += out_stride; 2505 2506 } 2507 2508 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); 2509 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]); 2510 2511 /* o6[0-3] */ 2512 { 2513 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2514 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2515 2516 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2517 2518 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2519 pi2_src_scratch += in_stride; 2520 2521 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2522 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2523 2524 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2525 m_count = _mm_cvtsi32_si128(i4_shift); 2526 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2527 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2528 2529 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2530 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2531 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2532 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2533 2534 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2535 2536 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2537 pi2_dst_scratch += out_stride; 2538 2539 } 2540 2541 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]); 2542 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]); 2543 2544 /* o7[0-3] */ 2545 { 2546 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2547 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2548 2549 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2550 2551 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2552 pi2_src_scratch += 8; 2553 2554 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2555 m_temp_reg_30 = 
_mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2556 2557 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2558 m_count = _mm_cvtsi32_si128(i4_shift); 2559 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2560 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2561 2562 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2563 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2564 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2565 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2566 2567 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2568 2569 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2570 pi2_dst_scratch += 8; 2571 2572 } 2573 2574 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]); 2575 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]); 2576 2577 /* o8[0-3] */ 2578 { 2579 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2580 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2581 2582 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2583 2584 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2585 pi2_src_scratch -= in_stride; 2586 2587 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2588 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2589 2590 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2591 m_count = _mm_cvtsi32_si128(i4_shift); 2592 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2593 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2594 2595 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2596 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2597 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2598 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2599 2600 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2601 2602 
_mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2603 pi2_dst_scratch -= out_stride; 2604 } 2605 2606 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]); 2607 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]); 2608 2609 /* o9[0-3] */ 2610 { 2611 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2612 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2613 2614 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2615 2616 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2617 pi2_src_scratch -= in_stride; 2618 2619 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2620 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2621 2622 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2623 m_count = _mm_cvtsi32_si128(i4_shift); 2624 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2625 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2626 2627 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2628 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2629 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2630 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2631 2632 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2633 2634 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2635 pi2_dst_scratch -= out_stride; 2636 } 2637 2638 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]); 2639 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]); 2640 2641 /* o10[0-3] */ 2642 { 2643 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2644 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2645 2646 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2647 2648 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2649 pi2_src_scratch -= in_stride; 2650 2651 m_temp_reg_31 = 
_mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2652 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2653 2654 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2655 m_count = _mm_cvtsi32_si128(i4_shift); 2656 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2657 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2658 2659 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2660 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2661 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2662 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2663 2664 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2665 2666 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2667 pi2_dst_scratch -= out_stride; 2668 } 2669 2670 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]); 2671 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]); 2672 2673 /* o11[0-3] */ 2674 { 2675 2676 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2677 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2678 2679 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2680 2681 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2682 pi2_src_scratch -= in_stride; 2683 2684 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2685 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2686 2687 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2688 m_count = _mm_cvtsi32_si128(i4_shift); 2689 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2690 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2691 2692 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2693 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2694 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2695 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2696 2697 
m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2698 2699 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2700 pi2_dst_scratch -= out_stride; 2701 2702 } 2703 2704 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]); 2705 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]); 2706 2707 /* o12[0-3] */ 2708 { 2709 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2710 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2711 2712 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2713 2714 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2715 pi2_src_scratch -= in_stride; 2716 2717 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2718 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2719 2720 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2721 m_count = _mm_cvtsi32_si128(i4_shift); 2722 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2723 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2724 2725 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2726 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2727 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2728 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2729 2730 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2731 2732 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2733 pi2_dst_scratch -= out_stride; 2734 2735 } 2736 2737 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]); 2738 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]); 2739 2740 /* o13[0-3] */ 2741 { 2742 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2743 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2744 2745 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2746 2747 m_temp_reg_30 = 
_mm_loadu_si128((__m128i *)pi2_src_scratch); 2748 pi2_src_scratch -= in_stride; 2749 2750 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2751 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2752 2753 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2754 m_count = _mm_cvtsi32_si128(i4_shift); 2755 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2756 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2757 2758 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2759 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2760 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2761 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2762 2763 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2764 2765 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2766 pi2_dst_scratch -= out_stride; 2767 } 2768 2769 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]); 2770 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]); 2771 2772 /* o14[0-3] */ 2773 { 2774 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2775 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2776 2777 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2778 2779 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2780 pi2_src_scratch -= in_stride; 2781 2782 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2783 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2784 2785 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2786 m_count = _mm_cvtsi32_si128(i4_shift); 2787 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2788 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2789 2790 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2791 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2792 m_temp_reg_31 = 
_mm_sra_epi32(m_temp_reg_31, m_count); 2793 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2794 2795 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2796 2797 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2798 pi2_dst_scratch -= out_stride; 2799 2800 } 2801 2802 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]); 2803 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]); 2804 2805 /* o15[0-3] */ 2806 { 2807 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2808 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2809 2810 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2811 2812 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2813 pi2_src_scratch += 8; 2814 2815 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2816 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2817 2818 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2819 m_count = _mm_cvtsi32_si128(i4_shift); 2820 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2821 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2822 2823 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2824 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2825 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2826 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2827 2828 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2829 2830 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2831 pi2_dst_scratch += 8; 2832 } 2833 2834 } 2835 } 2836 } 2837 else 2838 { 2839 /* o & stage 1 out */ 2840 { 2841 WORD32 j; 2842 2843 WORD16 *pi2_src_scratch = o_temp_ptr; 2844 WORD16 *pi2_dst_scratch = temp_ptr; 2845 WORD32 out_stride = (trans_size << 1); 2846 2847 WORD32 in_stride = trans_size; 2848 2849 2850 for(j = 0; j < 2; j++) 2851 { 2852 if(j) 2853 { 2854 m_temp_reg_70 = 
_mm_srli_si128(m_temp_reg_70, 8); 2855 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); 2856 m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8); 2857 m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8); 2858 m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8); 2859 m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8); 2860 m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8); 2861 m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8); 2862 2863 m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8); 2864 m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8); 2865 m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8); 2866 m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8); 2867 m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8); 2868 m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8); 2869 m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8); 2870 m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8); 2871 } 2872 2873 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); 2874 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]); 2875 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]); 2876 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]); 2877 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]); 2878 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]); 2879 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]); 2880 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]); 2881 2882 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved 2883 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved 2884 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved 2885 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved 
2886 temp1 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved 2887 temp2 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved 2888 temp3 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved 2889 temp4 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved 2890 2891 2892 /* o0[0-3] */ 2893 { 2894 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2895 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2896 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 2897 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 2898 2899 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2900 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 2901 2902 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 2903 2904 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 2905 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 2906 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 2907 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 2908 2909 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 2910 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 2911 2912 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 2913 2914 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 2915 2916 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2917 pi2_src_scratch += in_stride; 2918 2919 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2920 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2921 2922 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2923 m_count = _mm_cvtsi32_si128(i4_shift); 2924 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2925 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2926 2927 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2928 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, 
m_rdng_factor); 2929 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2930 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2931 2932 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2933 2934 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2935 pi2_dst_scratch += out_stride; 2936 2937 } 2938 2939 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); 2940 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]); 2941 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]); 2942 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]); 2943 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]); 2944 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]); 2945 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]); 2946 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]); 2947 2948 2949 /* o1[0-3] */ 2950 { 2951 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2952 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2953 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 2954 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 2955 2956 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2957 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 2958 2959 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20); 2960 2961 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 2962 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 2963 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 2964 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 2965 2966 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 2967 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 2968 2969 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 2970 2971 m_temp_reg_20 = 
_mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 2972 2973 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2974 pi2_src_scratch += in_stride; 2975 2976 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2977 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2978 2979 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2980 m_count = _mm_cvtsi32_si128(i4_shift); 2981 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2982 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2983 2984 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2985 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2986 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2987 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2988 2989 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2990 2991 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2992 pi2_dst_scratch += out_stride; 2993 2994 } 2995 2996 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); 2997 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]); 2998 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]); 2999 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]); 3000 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]); 3001 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]); 3002 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]); 3003 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]); 3004 3005 /* o2[0-3] */ 3006 { 3007 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3008 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3009 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3010 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3011 
3012 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 3013 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3014 3015 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3016 3017 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3018 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3019 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3020 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3021 3022 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41); 3023 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3024 3025 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42); 3026 3027 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3028 3029 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 3030 pi2_src_scratch += in_stride; 3031 3032 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3033 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3034 3035 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3036 m_count = _mm_cvtsi32_si128(i4_shift); 3037 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3038 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3039 3040 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3041 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3042 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3043 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3044 3045 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3046 3047 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3048 pi2_dst_scratch += out_stride; 3049 3050 } 3051 3052 3053 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); 3054 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]); 3055 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]); 3056 m_coeff4 = _mm_loadu_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]); 3057 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]); 3058 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]); 3059 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]); 3060 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]); 3061 3062 /* o3[0-3] */ 3063 { 3064 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3065 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3066 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3067 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3068 3069 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 3070 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3071 3072 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3073 3074 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3075 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3076 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3077 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3078 3079 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40); 3080 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3081 3082 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3083 3084 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3085 3086 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 3087 pi2_src_scratch += in_stride; 3088 3089 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3090 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3091 3092 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3093 m_count = _mm_cvtsi32_si128(i4_shift); 3094 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3095 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3096 3097 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3098 m_temp_reg_30 = 
_mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3099 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3100 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3101 3102 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3103 3104 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3105 pi2_dst_scratch += out_stride; 3106 3107 } 3108 3109 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); 3110 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]); 3111 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]); 3112 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]); 3113 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]); 3114 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]); 3115 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]); 3116 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]); 3117 3118 /* o4[0-3] */ 3119 { 3120 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3121 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3122 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3123 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3124 3125 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3126 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3127 3128 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3129 3130 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3131 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3132 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3133 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3134 3135 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3136 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3137 3138 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 
3139 3140 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3141 3142 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 3143 pi2_src_scratch += in_stride; 3144 3145 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3146 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3147 3148 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3149 m_count = _mm_cvtsi32_si128(i4_shift); 3150 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3151 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3152 3153 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3154 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3155 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3156 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3157 3158 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3159 3160 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3161 pi2_dst_scratch += out_stride; 3162 3163 } 3164 3165 3166 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); 3167 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]); 3168 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]); 3169 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]); 3170 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]); 3171 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]); 3172 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]); 3173 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]); 3174 3175 /* o5[0-3] */ 3176 { 3177 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3178 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3179 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3180 m_temp_reg_23 = 
_mm_madd_epi16(m_temp_reg_13, m_coeff4); 3181 3182 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3183 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3184 3185 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3186 3187 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3188 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3189 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3190 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3191 3192 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3193 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3194 3195 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3196 3197 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3198 3199 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 3200 pi2_src_scratch += in_stride; 3201 3202 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3203 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3204 3205 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3206 m_count = _mm_cvtsi32_si128(i4_shift); 3207 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3208 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3209 3210 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3211 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3212 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3213 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3214 3215 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3216 3217 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3218 pi2_dst_scratch += out_stride; 3219 3220 } 3221 3222 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); 3223 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]); 3224 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]); 3225 m_coeff4 = 
_mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]); 3226 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]); 3227 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]); 3228 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]); 3229 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]); 3230 3231 3232 /* o6[0-3] */ 3233 { 3234 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3235 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3236 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3237 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3238 3239 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3240 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3241 3242 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3243 3244 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3245 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3246 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3247 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3248 3249 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3250 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3251 3252 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3253 3254 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3255 3256 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 3257 pi2_src_scratch += in_stride; 3258 3259 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3260 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3261 3262 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3263 m_count = _mm_cvtsi32_si128(i4_shift); 3264 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3265 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3266 3267 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3268 
m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3269 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3270 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3271 3272 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3273 3274 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3275 pi2_dst_scratch += out_stride; 3276 3277 } 3278 3279 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]); 3280 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]); 3281 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]); 3282 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]); 3283 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]); 3284 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]); 3285 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]); 3286 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]); 3287 3288 /* o7[0-3] */ 3289 { 3290 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3291 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3292 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3293 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3294 3295 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3296 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3297 3298 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3299 3300 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3301 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3302 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3303 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3304 3305 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3306 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3307 3308 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, 
m_temp_reg_42); 3309 3310 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3311 3312 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 3313 pi2_src_scratch += 8; 3314 3315 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3316 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3317 3318 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3319 m_count = _mm_cvtsi32_si128(i4_shift); 3320 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3321 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3322 3323 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3324 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3325 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3326 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3327 3328 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3329 3330 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3331 pi2_dst_scratch += 8; 3332 3333 } 3334 3335 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]); 3336 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]); 3337 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]); 3338 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]); 3339 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]); 3340 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]); 3341 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]); 3342 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]); 3343 3344 3345 /* o8[0-3] */ 3346 { 3347 3348 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3349 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3350 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3351 m_temp_reg_23 = 
_mm_madd_epi16(m_temp_reg_13, m_coeff4); 3352 3353 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3354 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3355 3356 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3357 3358 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3359 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3360 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3361 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3362 3363 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3364 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3365 3366 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3367 3368 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3369 3370 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 3371 pi2_src_scratch -= in_stride; 3372 3373 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3374 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3375 3376 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3377 m_count = _mm_cvtsi32_si128(i4_shift); 3378 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3379 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3380 3381 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3382 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3383 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3384 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3385 3386 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3387 3388 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3389 pi2_dst_scratch -= out_stride; 3390 } 3391 3392 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]); 3393 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]); 3394 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]); 3395 m_coeff4 = 
_mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]); 3396 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]); 3397 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]); 3398 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]); 3399 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]); 3400 3401 3402 /* o9[0-3] */ 3403 { 3404 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3405 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3406 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3407 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3408 3409 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3410 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3411 3412 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3413 3414 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3415 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3416 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3417 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3418 3419 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3420 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3421 3422 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3423 3424 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3425 3426 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 3427 pi2_src_scratch -= in_stride; 3428 3429 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3430 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3431 3432 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3433 m_count = _mm_cvtsi32_si128(i4_shift); 3434 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3435 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3436 3437 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3438 
m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3439 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3440 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3441 3442 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3443 3444 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3445 pi2_dst_scratch -= out_stride; 3446 } 3447 3448 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]); 3449 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]); 3450 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]); 3451 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]); 3452 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]); 3453 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]); 3454 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]); 3455 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]); 3456 3457 /* o10[0-3] */ 3458 { 3459 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3460 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3461 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3462 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3463 3464 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3465 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3466 3467 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3468 3469 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3470 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3471 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3472 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3473 3474 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3475 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3476 3477 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, 
m_temp_reg_42); 3478 3479 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3480 3481 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 3482 pi2_src_scratch -= in_stride; 3483 3484 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3485 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3486 3487 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3488 m_count = _mm_cvtsi32_si128(i4_shift); 3489 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3490 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3491 3492 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3493 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3494 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3495 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3496 3497 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3498 3499 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3500 pi2_dst_scratch -= out_stride; 3501 } 3502 3503 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]); 3504 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]); 3505 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]); 3506 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]); 3507 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]); 3508 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]); 3509 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]); 3510 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]); 3511 3512 /* o11[0-3] */ 3513 { 3514 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3515 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3516 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3517 m_temp_reg_23 = 
_mm_madd_epi16(m_temp_reg_13, m_coeff4); 3518 3519 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3520 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3521 3522 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3523 3524 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3525 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3526 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3527 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3528 3529 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3530 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3531 3532 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3533 3534 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3535 3536 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 3537 pi2_src_scratch -= in_stride; 3538 3539 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3540 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3541 3542 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3543 m_count = _mm_cvtsi32_si128(i4_shift); 3544 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3545 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3546 3547 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3548 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3549 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3550 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3551 3552 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3553 3554 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3555 pi2_dst_scratch -= out_stride; 3556 3557 } 3558 3559 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]); 3560 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]); 3561 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]); 3562 m_coeff4 = 
_mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]); 3563 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]); 3564 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]); 3565 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]); 3566 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]); 3567 3568 3569 /* o12[0-3] */ 3570 { 3571 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3572 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3573 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3574 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3575 3576 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3577 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3578 3579 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3580 3581 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3582 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3583 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3584 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3585 3586 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3587 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3588 3589 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3590 3591 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3592 3593 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 3594 pi2_src_scratch -= in_stride; 3595 3596 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3597 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3598 3599 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3600 m_count = _mm_cvtsi32_si128(i4_shift); 3601 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3602 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3603 3604 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 
3605 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3606 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3607 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3608 3609 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3610 3611 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3612 pi2_dst_scratch -= out_stride; 3613 3614 } 3615 3616 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]); 3617 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]); 3618 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]); 3619 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]); 3620 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]); 3621 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]); 3622 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]); 3623 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]); 3624 3625 3626 /* o13[0-3] */ 3627 { 3628 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3629 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3630 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3631 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3632 3633 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3634 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3635 3636 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3637 3638 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3639 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3640 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3641 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3642 3643 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3644 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3645 3646 m_temp_reg_40 = 
_mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3647 3648 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3649 3650 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 3651 pi2_src_scratch -= in_stride; 3652 3653 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3654 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3655 3656 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3657 m_count = _mm_cvtsi32_si128(i4_shift); 3658 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3659 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3660 3661 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3662 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3663 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3664 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3665 3666 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3667 3668 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3669 pi2_dst_scratch -= out_stride; 3670 } 3671 3672 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]); 3673 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]); 3674 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]); 3675 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]); 3676 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]); 3677 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]); 3678 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]); 3679 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]); 3680 3681 3682 /* o14[0-3] */ 3683 { 3684 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3685 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3686 m_temp_reg_22 = 
_mm_madd_epi16(m_temp_reg_12, m_coeff3); 3687 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3688 3689 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3690 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3691 3692 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3693 3694 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3695 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3696 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3697 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3698 3699 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3700 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3701 3702 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3703 3704 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3705 3706 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 3707 pi2_src_scratch -= in_stride; 3708 3709 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3710 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3711 3712 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3713 m_count = _mm_cvtsi32_si128(i4_shift); 3714 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3715 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3716 3717 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3718 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3719 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3720 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3721 3722 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3723 3724 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3725 pi2_dst_scratch -= out_stride; 3726 3727 } 3728 3729 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]); 3730 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]); 3731 m_coeff3 = _mm_loadu_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]); 3732 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]); 3733 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]); 3734 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]); 3735 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]); 3736 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]); 3737 3738 /* o15[0-3] */ 3739 { 3740 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3741 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3742 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3743 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3744 3745 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3746 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3747 3748 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3749 3750 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3751 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3752 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3753 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3754 3755 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3756 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3757 3758 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3759 3760 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3761 3762 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 3763 pi2_src_scratch += 8; 3764 3765 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3766 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3767 3768 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3769 m_count = _mm_cvtsi32_si128(i4_shift); 3770 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3771 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3772 3773 
m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3774 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3775 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3776 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3777 3778 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3779 3780 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3781 pi2_dst_scratch += 8; 3782 } 3783 3784 } 3785 } 3786 } 3787 /* Transpose */ 3788 { 3789 WORD16 *pi2_src_scratch = temp_ptr; 3790 WORD16 *pi2_dst_scratch = pi2_tmp; 3791 WORD32 in_stride = (trans_size << 1); 3792 3793 for(j = 0; j < 2; j++) 3794 { 3795 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3796 pi2_src_scratch += in_stride; 3797 m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch); 3798 pi2_src_scratch += in_stride; 3799 m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch); 3800 pi2_src_scratch += in_stride; 3801 m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch); 3802 pi2_src_scratch += in_stride; 3803 m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch); 3804 pi2_src_scratch += in_stride; 3805 m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch); 3806 pi2_src_scratch += in_stride; 3807 m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch); 3808 pi2_src_scratch += in_stride; 3809 m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch); 3810 pi2_src_scratch += 8; 3811 3812 m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch); 3813 pi2_src_scratch -= in_stride; 3814 m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch); 3815 pi2_src_scratch -= in_stride; 3816 m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch); 3817 pi2_src_scratch -= in_stride; 3818 m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch); 3819 pi2_src_scratch -= in_stride; 3820 m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch); 3821 pi2_src_scratch -= in_stride; 3822 m_temp_reg_75 = _mm_load_si128((__m128i 
*)pi2_src_scratch); 3823 pi2_src_scratch -= in_stride; 3824 m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch); 3825 pi2_src_scratch -= in_stride; 3826 m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch); 3827 pi2_src_scratch += 8; 3828 3829 3830 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); 3831 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); 3832 3833 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); 3834 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); 3835 3836 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); 3837 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); 3838 3839 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); 3840 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); 3841 3842 m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); 3843 m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70); 3844 3845 m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); 3846 m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72); 3847 3848 m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); 3849 m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74); 3850 3851 m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); 3852 m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76); 3853 3854 /****************/ 3855 3856 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); 3857 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); 3858 3859 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); 3860 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); 3861 3862 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82); 3863 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82); 3864 3865 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86); 3866 m_temp_reg_7 = 
_mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86); 3867 3868 m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); 3869 m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); 3870 3871 m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); 3872 m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); 3873 3874 m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81); 3875 m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81); 3876 3877 m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85); 3878 m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85); 3879 3880 /******************/ 3881 3882 m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2); 3883 m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2); 3884 3885 m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90); 3886 m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90); 3887 3888 m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6); 3889 m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6); 3890 3891 m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94); 3892 m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94); 3893 3894 m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3); 3895 m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3); 3896 3897 m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91); 3898 m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91); 3899 3900 m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7); 3901 m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7); 3902 3903 m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95); 3904 m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95); 3905 3906 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_30); 3907 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), 
m_temp_reg_34); 3908 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_36); 3909 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_32); 3910 3911 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_31); 3912 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_35); 3913 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_37); 3914 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_33); 3915 3916 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_80); 3917 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_84); 3918 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_86); 3919 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_82); 3920 3921 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_81); 3922 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_85); 3923 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_87); 3924 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_83); 3925 3926 pi2_dst_scratch += 4 * trans_size; 3927 } 3928 } 3929 pi2_src += 8; 3930 // pi2_dequant_coeff +=8; 3931 pi2_tmp += 8 * trans_size; 3932 zero_cols = zero_cols >> 1; 3933 } 3934 3935 if(trans_size_stg1 != TRANS_SIZE_32) 3936 { 3937 m_temp_reg_10 = _mm_setzero_si128(); 3938 3939 for(i = trans_size_stg1; i < 32; i += 8) 3940 { 3941 WORD16 *pi2_dst_scratch = pi2_tmp; 3942 3943 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_10); 3944 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_10); 3945 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_10); 3946 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), 
m_temp_reg_10); 3947 3948 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_10); 3949 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_10); 3950 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_10); 3951 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_10); 3952 3953 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_10); 3954 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_10); 3955 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_10); 3956 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_10); 3957 3958 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_10); 3959 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_10); 3960 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_10); 3961 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_10); 3962 3963 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size), m_temp_reg_10); 3964 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 8), m_temp_reg_10); 3965 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 16), m_temp_reg_10); 3966 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 24), m_temp_reg_10); 3967 3968 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size), m_temp_reg_10); 3969 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 8), m_temp_reg_10); 3970 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 16), m_temp_reg_10); 3971 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 24), m_temp_reg_10); 3972 3973 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size), m_temp_reg_10); 3974 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 8), m_temp_reg_10); 3975 
_mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 16), m_temp_reg_10); 3976 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 24), m_temp_reg_10); 3977 3978 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size), m_temp_reg_10); 3979 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 8), m_temp_reg_10); 3980 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 16), m_temp_reg_10); 3981 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 24), m_temp_reg_10); 3982 3983 pi2_tmp += 8 * trans_size; 3984 } 3985 } 3986 3987 pi2_tmp = pi2_tmp_orig; 3988 3989 /* Inverse Transform 2nd stage */ 3990 3991 3992 for(j = 0; j < trans_size; j += 4) 3993 { 3994 i4_shift = IT_SHIFT_STAGE_2; 3995 3996 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ 3997 if(zero_last28_rows_stg2) 3998 { 3999 { 4000 4001 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87 4002 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 4003 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80 4004 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70 4005 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 4006 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43 4007 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 4008 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9 4009 4010 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]); 4011 4012 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, all_zero_reg); 4013 4014 /* eo0[0-3] */ 4015 { 4016 m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4017 4018 } 4019 /* eo1[0-3] */ 4020 { 4021 m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff2); 4022 4023 } 4024 /* 
eo2[0-3] */ 4025 { 4026 m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 4027 } 4028 4029 /* eo3[0-3] */ 4030 { 4031 m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff4); 4032 } 4033 /* eo4[0-3] */ 4034 { 4035 m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 4036 } 4037 4038 /* eo5[0-3] */ 4039 { 4040 m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff6); 4041 } 4042 4043 /* eo6[0-3] */ 4044 { 4045 m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff7); 4046 } 4047 /* eo7[0-3] */ 4048 { 4049 m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff8); 4050 } 4051 } 4052 4053 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 4054 4055 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]); 4056 4057 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg); 4058 4059 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 4060 4061 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 4062 4063 /* e[]*/ 4064 4065 temp1 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_90); /* ee[0] */ 4066 temp2 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_90); /* ee[15] */ 4067 4068 temp3 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_91); /* ee[1] */ 4069 temp4 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_91); /* ee[14] */ 4070 4071 temp5 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_92); /* ee[2] */ 4072 temp6 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_92); /* ee[13] */ 4073 4074 temp7 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_93); /* ee[3] */ 4075 temp8 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_93); /* ee[12] */ 4076 4077 m_temp_reg_90 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_94); /* ee[4] */ 4078 m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_94); /* ee[11] */ 4079 4080 m_temp_reg_92 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_95); /* ee[5] */ 4081 m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_95); /* ee[10] */ 4082 4083 m_temp_reg_94 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_96); /* ee[6] */ 4084 m_temp_reg_95 
= _mm_sub_epi32(m_temp_reg_16, m_temp_reg_96); /* ee[9] */ 4085 4086 m_temp_reg_96 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_97); /* ee[7] */ 4087 m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_97); /* ee[8] */ 4088 4089 /*o[k]*/ 4090 { 4091 4092 WORD16 *pi2_dst_scratch = temp_ptr; 4093 WORD32 out_stride = 8; 4094 4095 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); 4096 4097 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]); 4098 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]); 4099 4100 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved 4101 4102 4103 /* o0[0-3] */ 4104 { 4105 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4106 4107 m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20); 4108 m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20); 4109 4110 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4111 m_count = _mm_cvtsi32_si128(i4_shift); 4112 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4113 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4114 4115 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4116 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4117 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4118 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4119 4120 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4121 4122 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4123 pi2_dst_scratch += out_stride; 4124 4125 } 4126 4127 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); 4128 4129 /* o1[0-3] */ 4130 { 4131 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4132 4133 m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20); 4134 m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20); 4135 4136 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4137 m_count = 
_mm_cvtsi32_si128(i4_shift); 4138 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4139 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4140 4141 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4142 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4143 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4144 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4145 4146 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4147 4148 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4149 pi2_dst_scratch += out_stride; 4150 4151 } 4152 4153 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); 4154 4155 /* o2[0-3] */ 4156 { 4157 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4158 4159 m_temp_reg_31 = _mm_sub_epi32(temp5, m_temp_reg_20); 4160 m_temp_reg_30 = _mm_add_epi32(temp5, m_temp_reg_20); 4161 4162 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4163 m_count = _mm_cvtsi32_si128(i4_shift); 4164 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4165 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4166 4167 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4168 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4169 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4170 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4171 4172 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4173 4174 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4175 pi2_dst_scratch += out_stride; 4176 4177 } 4178 4179 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); 4180 4181 /* o3[0-3] */ 4182 { 4183 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4184 4185 m_temp_reg_31 = _mm_sub_epi32(temp7, m_temp_reg_20); 4186 m_temp_reg_30 = _mm_add_epi32(temp7, m_temp_reg_20); 4187 4188 m_rdng_factor = _mm_cvtsi32_si128((1 << 
(i4_shift - 1))); 4189 m_count = _mm_cvtsi32_si128(i4_shift); 4190 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4191 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4192 4193 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4194 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4195 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4196 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4197 4198 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4199 4200 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4201 pi2_dst_scratch += out_stride; 4202 4203 } 4204 4205 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); 4206 4207 /* o4[0-3] */ 4208 { 4209 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4210 4211 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20); 4212 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20); 4213 4214 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4215 m_count = _mm_cvtsi32_si128(i4_shift); 4216 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4217 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4218 4219 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4220 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4221 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4222 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4223 4224 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4225 4226 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4227 pi2_dst_scratch += out_stride; 4228 4229 } 4230 4231 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); 4232 4233 /* o5[0-3] */ 4234 { 4235 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4236 4237 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20); 4238 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, 
m_temp_reg_20); 4239 4240 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4241 m_count = _mm_cvtsi32_si128(i4_shift); 4242 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4243 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4244 4245 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4246 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4247 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4248 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4249 4250 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4251 4252 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4253 pi2_dst_scratch += out_stride; 4254 4255 } 4256 4257 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); 4258 4259 /* o6[0-3] */ 4260 { 4261 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4262 4263 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20); 4264 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20); 4265 4266 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4267 m_count = _mm_cvtsi32_si128(i4_shift); 4268 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4269 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4270 4271 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4272 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4273 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4274 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4275 4276 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4277 4278 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4279 pi2_dst_scratch += out_stride; 4280 4281 } 4282 4283 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]); 4284 4285 /* o7[0-3] */ 4286 { 4287 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4288 4289 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, 
m_temp_reg_20); 4290 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20); 4291 4292 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4293 m_count = _mm_cvtsi32_si128(i4_shift); 4294 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4295 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4296 4297 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4298 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4299 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4300 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4301 4302 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4303 4304 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4305 pi2_dst_scratch += 8; 4306 4307 } 4308 4309 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]); 4310 4311 /* o8[0-3] */ 4312 { 4313 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4314 4315 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20); 4316 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20); 4317 4318 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4319 m_count = _mm_cvtsi32_si128(i4_shift); 4320 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4321 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4322 4323 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4324 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4325 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4326 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4327 4328 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4329 4330 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4331 pi2_dst_scratch += out_stride; 4332 } 4333 4334 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]); 4335 4336 /* o9[0-3] */ 4337 { 4338 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4339 4340 
m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20); 4341 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20); 4342 4343 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4344 m_count = _mm_cvtsi32_si128(i4_shift); 4345 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4346 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4347 4348 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4349 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4350 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4351 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4352 4353 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4354 4355 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4356 pi2_dst_scratch += out_stride; 4357 4358 } 4359 4360 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]); 4361 4362 /* o10[0-3] */ 4363 { 4364 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4365 4366 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20); 4367 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20); 4368 4369 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4370 m_count = _mm_cvtsi32_si128(i4_shift); 4371 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4372 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4373 4374 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4375 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4376 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4377 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4378 4379 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4380 4381 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4382 pi2_dst_scratch += out_stride; 4383 } 4384 4385 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]); 4386 4387 /* o11[0-3] */ 4388 { 4389 m_temp_reg_20 
= _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4390 4391 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20); 4392 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20); 4393 4394 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4395 m_count = _mm_cvtsi32_si128(i4_shift); 4396 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4397 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4398 4399 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4400 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4401 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4402 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4403 4404 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4405 4406 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4407 pi2_dst_scratch += out_stride; 4408 4409 } 4410 4411 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]); 4412 4413 /* o12[0-3] */ 4414 { 4415 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4416 4417 m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20); 4418 m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20); 4419 4420 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4421 m_count = _mm_cvtsi32_si128(i4_shift); 4422 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4423 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4424 4425 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4426 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4427 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4428 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4429 4430 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4431 4432 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4433 pi2_dst_scratch += out_stride; 4434 4435 } 4436 4437 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]); 4438 
4439 /* o13[0-3] */ 4440 { 4441 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4442 4443 m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20); 4444 m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20); 4445 4446 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4447 m_count = _mm_cvtsi32_si128(i4_shift); 4448 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4449 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4450 4451 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4452 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4453 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4454 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4455 4456 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4457 4458 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4459 pi2_dst_scratch += out_stride; 4460 } 4461 4462 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]); 4463 4464 /* o14[0-3] */ 4465 { 4466 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4467 4468 m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20); 4469 m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20); 4470 4471 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4472 m_count = _mm_cvtsi32_si128(i4_shift); 4473 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4474 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4475 4476 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4477 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4478 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4479 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4480 4481 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4482 4483 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4484 pi2_dst_scratch += out_stride; 4485 4486 } 4487 4488 m_coeff1 = _mm_loadu_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]); 4489 4490 /* o15[0-3] */ 4491 { 4492 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4493 4494 m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20); 4495 m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20); 4496 4497 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4498 m_count = _mm_cvtsi32_si128(i4_shift); 4499 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4500 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4501 4502 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4503 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4504 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4505 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4506 4507 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4508 4509 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4510 pi2_dst_scratch += 8; 4511 } 4512 4513 } 4514 4515 } 4516 else if(zero_last24_rows_stg2) 4517 { 4518 /* eo */ 4519 { 4520 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87 4521 4522 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]); 4523 m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]); 4524 4525 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11); 4526 4527 4528 /* eo0[0-3] */ 4529 { 4530 m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4531 4532 } 4533 4534 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57 4535 4536 /* eo1[0-3] */ 4537 { 4538 m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4539 4540 } 4541 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9 4542 4543 /* eo2[0-3] */ 4544 { 4545 m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4546 4547 } 4548 4549 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43 4550 4551 /* 
eo3[0-3] */ 4552 { 4553 4554 m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4555 4556 } 4557 4558 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80 4559 4560 /* eo4[0-3] */ 4561 { 4562 m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4563 4564 } 4565 4566 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90 4567 4568 /* eo5[0-3] */ 4569 { 4570 m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4571 } 4572 4573 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70 4574 /* eo6[0-3] */ 4575 { 4576 m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4577 } 4578 4579 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25 4580 /* eo7[0-3] */ 4581 { 4582 m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4583 4584 } 4585 4586 } 4587 4588 /* eeo */ 4589 { 4590 4591 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75 4592 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 4593 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 4594 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 4595 4596 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]); 4597 4598 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg); 4599 4600 /* eeo0[0-3] */ 4601 { 4602 temp1 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4603 4604 } 4605 4606 /* eeo1[0-3] */ 4607 { 4608 temp2 = _mm_madd_epi16(m_temp_reg_10, m_coeff2); 4609 4610 } 4611 4612 /* eo2[0-3] */ 4613 { 4614 temp3 = _mm_madd_epi16(m_temp_reg_10, m_coeff4); 4615 4616 } 4617 4618 4619 /* eo3[0-3] */ 4620 { 4621 temp4 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 4622 4623 } 4624 4625 } 4626 4627 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 4628 m_coeff2 = 
_mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 4629 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 4630 4631 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]); 4632 4633 //m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_70); 4634 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg); 4635 4636 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 4637 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 4638 4639 m_temp_reg_70 = _mm_add_epi32(m_temp_reg_14, temp1); /* ee[0] */ 4640 m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_14, temp1); /* ee[7] */ 4641 4642 m_temp_reg_72 = _mm_add_epi32(m_temp_reg_16, temp2); /* ee[1] */ 4643 m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_16, temp2); /* ee[6] */ 4644 4645 m_temp_reg_74 = _mm_add_epi32(m_temp_reg_16, temp3); /* ee[2] */ 4646 m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_16, temp3); /* ee[5] */ 4647 4648 m_temp_reg_76 = _mm_add_epi32(m_temp_reg_14, temp4); /* ee[3] */ 4649 m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_14, temp4); /* ee[4] */ 4650 4651 /* e[]*/ 4652 4653 temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90); /* ee[0] */ 4654 temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90); /* ee[15] */ 4655 4656 temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91); /* ee[1] */ 4657 temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91); /* ee[14] */ 4658 4659 temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92); /* ee[2] */ 4660 temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92); /* ee[13] */ 4661 4662 temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93); /* ee[3] */ 4663 temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93); /* ee[12] */ 4664 4665 m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94); /* ee[4] */ 4666 m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94); /* ee[11] */ 4667 4668 m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95); /* ee[5] */ 4669 m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95); /* 
ee[10] */ 4670 4671 m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96); /* ee[6] */ 4672 m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96); /* ee[9] */ 4673 4674 m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97); /* ee[7] */ 4675 m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97); /* ee[8] */ 4676 4677 /*o[k] */ 4678 { 4679 4680 WORD16 *pi2_dst_scratch = temp_ptr; 4681 WORD32 out_stride = 8; 4682 4683 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); 4684 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]); 4685 4686 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]); 4687 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]); 4688 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]); 4689 m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]); 4690 4691 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); 4692 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); 4693 4694 /* o0[0-3] */ 4695 { 4696 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4697 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 4698 4699 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 4700 4701 m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20); 4702 m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20); 4703 4704 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4705 m_count = _mm_cvtsi32_si128(i4_shift); 4706 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4707 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4708 4709 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4710 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4711 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4712 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4713 4714 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4715 
4716 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4717 pi2_dst_scratch += out_stride; 4718 4719 } 4720 4721 4722 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); 4723 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]); 4724 4725 /* o1[0-3] */ 4726 { 4727 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4728 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 4729 4730 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 4731 4732 m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20); 4733 m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20); 4734 4735 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4736 m_count = _mm_cvtsi32_si128(i4_shift); 4737 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4738 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4739 4740 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4741 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4742 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4743 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4744 4745 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4746 4747 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4748 pi2_dst_scratch += out_stride; 4749 4750 } 4751 4752 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); 4753 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]); 4754 4755 /* o2[0-3] */ 4756 { 4757 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4758 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 4759 4760 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 4761 4762 m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20); 4763 m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20); 4764 4765 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4766 m_count = _mm_cvtsi32_si128(i4_shift); 
4767 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4768 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4769 4770 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4771 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4772 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4773 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4774 4775 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4776 4777 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4778 pi2_dst_scratch += out_stride; 4779 4780 } 4781 4782 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); 4783 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]); 4784 4785 /* o3[0-3] */ 4786 { 4787 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4788 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 4789 4790 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 4791 4792 m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20); 4793 m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20); 4794 4795 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4796 m_count = _mm_cvtsi32_si128(i4_shift); 4797 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4798 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4799 4800 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4801 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4802 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4803 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4804 4805 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4806 4807 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4808 pi2_dst_scratch += out_stride; 4809 4810 } 4811 4812 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); 4813 m_coeff2 = _mm_loadu_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]); 4814 4815 /* o4[0-3] */ 4816 { 4817 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4818 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 4819 4820 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 4821 4822 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20); 4823 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20); 4824 4825 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4826 m_count = _mm_cvtsi32_si128(i4_shift); 4827 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4828 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4829 4830 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4831 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4832 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4833 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4834 4835 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4836 4837 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4838 pi2_dst_scratch += out_stride; 4839 4840 } 4841 4842 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); 4843 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]); 4844 4845 /* o5[0-3] */ 4846 { 4847 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4848 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 4849 4850 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 4851 4852 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20); 4853 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20); 4854 4855 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4856 m_count = _mm_cvtsi32_si128(i4_shift); 4857 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4858 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4859 4860 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4861 
m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4862 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4863 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4864 4865 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4866 4867 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4868 pi2_dst_scratch += out_stride; 4869 4870 } 4871 4872 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); 4873 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]); 4874 4875 /* o6[0-3] */ 4876 { 4877 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4878 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 4879 4880 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 4881 4882 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20); 4883 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20); 4884 4885 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4886 m_count = _mm_cvtsi32_si128(i4_shift); 4887 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4888 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4889 4890 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4891 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4892 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4893 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4894 4895 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4896 4897 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4898 pi2_dst_scratch += out_stride; 4899 4900 } 4901 4902 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]); 4903 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]); 4904 4905 /* o7[0-3] */ 4906 { 4907 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4908 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 4909 4910 m_temp_reg_20 = 
_mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 4911 4912 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20); 4913 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20); 4914 4915 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4916 m_count = _mm_cvtsi32_si128(i4_shift); 4917 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4918 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4919 4920 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4921 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4922 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4923 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4924 4925 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4926 4927 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4928 pi2_dst_scratch += 8; 4929 4930 } 4931 4932 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]); 4933 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]); 4934 4935 /* o8[0-3] */ 4936 { 4937 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4938 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 4939 4940 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 4941 4942 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20); 4943 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20); 4944 4945 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4946 m_count = _mm_cvtsi32_si128(i4_shift); 4947 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4948 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4949 4950 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4951 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4952 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4953 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4954 4955 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, 
m_temp_reg_31); 4956 4957 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4958 pi2_dst_scratch += out_stride; 4959 } 4960 4961 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]); 4962 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]); 4963 4964 /* o9[0-3] */ 4965 { 4966 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4967 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 4968 4969 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 4970 4971 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20); 4972 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20); 4973 4974 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4975 m_count = _mm_cvtsi32_si128(i4_shift); 4976 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4977 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4978 4979 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4980 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4981 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4982 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4983 4984 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4985 4986 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4987 pi2_dst_scratch += out_stride; 4988 } 4989 4990 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]); 4991 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]); 4992 4993 /* o10[0-3] */ 4994 { 4995 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4996 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 4997 4998 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 4999 5000 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20); 5001 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20); 5002 5003 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5004 
m_count = _mm_cvtsi32_si128(i4_shift); 5005 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5006 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5007 5008 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5009 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5010 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5011 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5012 5013 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5014 5015 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5016 pi2_dst_scratch += out_stride; 5017 } 5018 5019 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]); 5020 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]); 5021 5022 /* o11[0-3] */ 5023 { 5024 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5025 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5026 5027 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 5028 5029 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20); 5030 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20); 5031 5032 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5033 m_count = _mm_cvtsi32_si128(i4_shift); 5034 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5035 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5036 5037 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5038 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5039 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5040 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5041 5042 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5043 5044 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5045 pi2_dst_scratch += out_stride; 5046 5047 } 5048 5049 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]); 5050 m_coeff2 = 
_mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]); 5051 5052 /* o12[0-3] */ 5053 { 5054 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5055 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5056 5057 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 5058 5059 m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20); 5060 m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20); 5061 5062 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5063 m_count = _mm_cvtsi32_si128(i4_shift); 5064 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5065 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5066 5067 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5068 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5069 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5070 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5071 5072 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5073 5074 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5075 pi2_dst_scratch += out_stride; 5076 5077 } 5078 5079 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]); 5080 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]); 5081 5082 /* o13[0-3] */ 5083 { 5084 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5085 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5086 5087 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 5088 5089 m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20); 5090 m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20); 5091 5092 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5093 m_count = _mm_cvtsi32_si128(i4_shift); 5094 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5095 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5096 5097 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5098 
m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5099 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5100 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5101 5102 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5103 5104 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5105 pi2_dst_scratch += out_stride; 5106 } 5107 5108 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]); 5109 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]); 5110 5111 /* o14[0-3] */ 5112 { 5113 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5114 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5115 5116 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 5117 5118 m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20); 5119 m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20); 5120 5121 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5122 m_count = _mm_cvtsi32_si128(i4_shift); 5123 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5124 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5125 5126 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5127 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5128 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5129 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5130 5131 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5132 5133 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5134 pi2_dst_scratch += out_stride; 5135 } 5136 5137 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]); 5138 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]); 5139 5140 /* o15[0-3] */ 5141 { 5142 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5143 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5144 5145 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, 
m_temp_reg_20); 5146 5147 m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20); 5148 m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20); 5149 5150 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5151 m_count = _mm_cvtsi32_si128(i4_shift); 5152 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5153 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5154 5155 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5156 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5157 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5158 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5159 5160 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5161 5162 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5163 pi2_dst_scratch += 8; 5164 } 5165 5166 } 5167 } 5168 else 5169 { 5170 /* eo */ 5171 { 5172 5173 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87 5174 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70 5175 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43 5176 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9 5177 5178 5179 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]); 5180 m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]); 5181 m_temp_reg_12 = _mm_loadu_si128((__m128i *)&pi2_tmp[10 * trans_size]); 5182 m_temp_reg_13 = _mm_loadu_si128((__m128i *)&pi2_tmp[14 * trans_size]); 5183 m_temp_reg_18 = _mm_loadu_si128((__m128i *)&pi2_tmp[18 * trans_size]); 5184 m_temp_reg_19 = _mm_loadu_si128((__m128i *)&pi2_tmp[22 * trans_size]); 5185 m_temp_reg_20 = _mm_loadu_si128((__m128i *)&pi2_tmp[26 * trans_size]); 5186 m_temp_reg_21 = _mm_loadu_si128((__m128i *)&pi2_tmp[30 * trans_size]); 5187 5188 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11); 5189 m_temp_reg_11 = 
_mm_unpacklo_epi16(m_temp_reg_12, m_temp_reg_13); 5190 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_18, m_temp_reg_19); 5191 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_21); 5192 5193 /* eo0[0-3] */ 5194 { 5195 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5196 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5197 5198 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 5199 5200 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5201 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5202 5203 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 5204 5205 m_temp_reg_90 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 5206 5207 } 5208 5209 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57 5210 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0 -43 5211 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80 90 5212 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70 25 5213 5214 /* eo1[0-3] */ 5215 { 5216 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5217 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5218 5219 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 5220 5221 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5222 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5223 5224 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 5225 5226 m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32); 5227 5228 } 5229 5230 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9 5231 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70 87 5232 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25 57 5233 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90 43 5234 5235 /* eo2[0-3] */ 5236 { 
5237 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5238 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5239 5240 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31); 5241 5242 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5243 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5244 5245 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 5246 5247 m_temp_reg_92 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 5248 5249 } 5250 5251 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43 5252 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87 9 5253 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90 25 5254 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80 57 5255 5256 /* eo3[0-3] */ 5257 { 5258 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5259 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5260 5261 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 5262 5263 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5264 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5265 5266 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33); 5267 5268 m_temp_reg_93 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 5269 5270 } 5271 5272 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80 5273 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25 90 5274 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9 87 5275 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43 70 5276 5277 5278 /* eo4[0-3] */ 5279 { 5280 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5281 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5282 5283 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 5284 5285 
m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5286 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5287 5288 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32); 5289 5290 m_temp_reg_94 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 5291 5292 } 5293 5294 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90 5295 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57 25 5296 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87 70 5297 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9 -80 5298 5299 /* eo5[0-3] */ 5300 { 5301 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5302 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5303 5304 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 5305 5306 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5307 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5308 5309 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 5310 5311 m_temp_reg_95 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 5312 } 5313 5314 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70 5315 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90 -80 5316 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43 9 5317 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57 87 5318 5319 /* eo6[0-3] */ 5320 { 5321 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5322 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5323 5324 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 5325 5326 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5327 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5328 5329 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 5330 5331 m_temp_reg_96 = 
_mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 5332 5333 } 5334 5335 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25 5336 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43 -57 5337 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70 -80 5338 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87 -90 5339 5340 /* eo7[0-3] */ 5341 { 5342 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5343 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5344 5345 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 5346 5347 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5348 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5349 5350 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 5351 5352 m_temp_reg_97 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 5353 5354 5355 } 5356 5357 } 5358 5359 /* eeo */ 5360 { 5361 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75 5362 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18 5363 5364 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]); 5365 m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[12 * trans_size]); 5366 m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[20 * trans_size]); 5367 m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[28 * trans_size]); 5368 5369 /* eeo0[0-3] */ 5370 { 5371 5372 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 5373 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86); 5374 5375 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5376 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5377 5378 temp1 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 5379 5380 } 5381 5382 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18 5383 m_coeff4 = 
_mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89 50 5384 5385 /* eeo1[0-3] */ 5386 { 5387 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 5388 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4); 5389 5390 temp2 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31); 5391 5392 } 5393 5394 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89 5395 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 75 5396 5397 /* eo2[0-3] */ 5398 { 5399 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 5400 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4); 5401 5402 temp3 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 5403 5404 } 5405 5406 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50 5407 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75 -89 5408 5409 /* eo3[0-3] */ 5410 { 5411 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 5412 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4); 5413 5414 temp4 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 5415 5416 } 5417 5418 5419 } 5420 5421 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36 5422 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83 5423 5424 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64 5425 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64 5426 5427 m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[8 * trans_size]); 5428 m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[24 * trans_size]); 5429 5430 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84); 5431 5432 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]); 5433 m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[16 * trans_size]); 5434 5435 m_temp_reg_1 = 
_mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80); 5436 5437 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */ 5438 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */ 5439 5440 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */ 5441 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */ 5442 5443 /* eeeo[0]= m_temp_reg_20 */ 5444 /* eeeo[1]= m_temp_reg_21 */ 5445 /* eeee[0]= m_temp_reg_22 */ 5446 /* eeee[1]= m_temp_reg_23 */ 5447 5448 /* eee[0] = eeee[0] + eeeo[0]; */ 5449 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[0] */ 5450 5451 /* eee[3] = eeee[0] - eeeo[0]; */ 5452 m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[1] */ 5453 5454 /* eee[2] = eeee[1] - eeeo[1]; */ 5455 m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[1] */ 5456 5457 /* eee[1] = eeee[1] + eeeo[1];*/ 5458 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[0] */ 5459 5460 m_temp_reg_70 = _mm_add_epi32(m_temp_reg_40, temp1); /* ee[0] */ 5461 m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_40, temp1); /* ee[7] */ 5462 5463 m_temp_reg_72 = _mm_add_epi32(m_temp_reg_41, temp2); /* ee[1] */ 5464 m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_41, temp2); /* ee[6] */ 5465 5466 m_temp_reg_74 = _mm_add_epi32(m_temp_reg_42, temp3); /* ee[2] */ 5467 m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_42, temp3); /* ee[5] */ 5468 5469 m_temp_reg_76 = _mm_add_epi32(m_temp_reg_43, temp4); /* ee[3] */ 5470 m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_43, temp4); /* ee[4] */ 5471 5472 /* e[]*/ 5473 5474 temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90); /* ee[0] */ 5475 temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90); /* ee[15] */ 5476 5477 temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91); /* ee[1] */ 5478 temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91); /* ee[14] */ 5479 5480 temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92); /* ee[2] */ 5481 temp6 = 
_mm_sub_epi32(m_temp_reg_74, m_temp_reg_92); /* ee[13] */ 5482 5483 temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93); /* ee[3] */ 5484 temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93); /* ee[12] */ 5485 5486 m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94); /* ee[4] */ 5487 m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94); /* ee[11] */ 5488 5489 m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95); /* ee[5] */ 5490 m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95); /* ee[10] */ 5491 5492 m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96); /* ee[6] */ 5493 m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96); /* ee[9] */ 5494 5495 m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97); /* ee[7] */ 5496 m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97); /* ee[8] */ 5497 5498 /*o[k] */ 5499 { 5500 5501 WORD16 *pi2_dst_scratch = temp_ptr; 5502 WORD32 out_stride = 8; 5503 5504 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); 5505 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]); 5506 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]); 5507 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]); 5508 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]); 5509 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]); 5510 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]); 5511 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]); 5512 5513 5514 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]); 5515 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]); 5516 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]); 5517 m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]); 5518 m_temp_reg_74 = 
_mm_loadu_si128((__m128i *)&pi2_tmp[9 * trans_size]); 5519 m_temp_reg_75 = _mm_loadu_si128((__m128i *)&pi2_tmp[11 * trans_size]); 5520 m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[13 * trans_size]); 5521 m_temp_reg_77 = _mm_loadu_si128((__m128i *)&pi2_tmp[15 * trans_size]); 5522 5523 m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[17 * trans_size]); 5524 m_temp_reg_81 = _mm_loadu_si128((__m128i *)&pi2_tmp[19 * trans_size]); 5525 m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[21 * trans_size]); 5526 m_temp_reg_83 = _mm_loadu_si128((__m128i *)&pi2_tmp[23 * trans_size]); 5527 m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[25 * trans_size]); 5528 m_temp_reg_85 = _mm_loadu_si128((__m128i *)&pi2_tmp[27 * trans_size]); 5529 m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[29 * trans_size]); 5530 m_temp_reg_87 = _mm_loadu_si128((__m128i *)&pi2_tmp[31 * trans_size]); 5531 5532 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved 5533 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved 5534 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved 5535 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved 5536 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved 5537 m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved 5538 m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved 5539 m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved 5540 5541 /* o0[0-3] */ 5542 { 5543 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5544 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5545 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5546 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 
5547 5548 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 5549 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 5550 5551 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 5552 5553 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 5554 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 5555 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 5556 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 5557 5558 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 5559 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 5560 5561 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 5562 5563 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 5564 5565 m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20); 5566 m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20); 5567 5568 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5569 m_count = _mm_cvtsi32_si128(i4_shift); 5570 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5571 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5572 5573 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5574 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5575 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5576 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5577 5578 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5579 5580 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5581 pi2_dst_scratch += out_stride; 5582 5583 } 5584 5585 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); 5586 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]); 5587 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]); 5588 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]); 5589 m_coeff5 = _mm_loadu_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]); 5590 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]); 5591 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]); 5592 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]); 5593 5594 /* o1[0-3] */ 5595 { 5596 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5597 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5598 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5599 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5600 5601 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 5602 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 5603 5604 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20); 5605 5606 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 5607 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 5608 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 5609 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 5610 5611 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 5612 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 5613 5614 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 5615 5616 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 5617 5618 m_temp_reg_31 = _mm_add_epi32(temp3, m_temp_reg_20); 5619 m_temp_reg_30 = _mm_sub_epi32(temp3, m_temp_reg_20); 5620 5621 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5622 m_count = _mm_cvtsi32_si128(i4_shift); 5623 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5624 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5625 5626 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5627 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5628 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5629 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5630 5631 m_temp_reg_30 = 
_mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5632 5633 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5634 pi2_dst_scratch += out_stride; 5635 5636 } 5637 5638 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); 5639 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]); 5640 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]); 5641 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]); 5642 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]); 5643 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]); 5644 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]); 5645 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]); 5646 5647 /* o2[0-3] */ 5648 { 5649 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5650 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5651 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5652 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5653 5654 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 5655 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 5656 5657 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 5658 5659 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 5660 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 5661 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 5662 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 5663 5664 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41); 5665 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 5666 5667 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42); 5668 5669 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 5670 5671 m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20); 5672 m_temp_reg_30 = 
_mm_sub_epi32(temp5, m_temp_reg_20); 5673 5674 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5675 m_count = _mm_cvtsi32_si128(i4_shift); 5676 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5677 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5678 5679 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5680 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5681 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5682 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5683 5684 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5685 5686 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5687 pi2_dst_scratch += out_stride; 5688 5689 } 5690 5691 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); 5692 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]); 5693 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]); 5694 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]); 5695 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]); 5696 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]); 5697 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]); 5698 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]); 5699 5700 /* o3[0-3] */ 5701 { 5702 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5703 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5704 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5705 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5706 5707 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 5708 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 5709 5710 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 5711 5712 m_temp_reg_40 = 
_mm_madd_epi16(m_temp_reg_14, m_coeff5); 5713 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 5714 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 5715 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 5716 5717 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40); 5718 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 5719 5720 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 5721 5722 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 5723 5724 m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20); 5725 m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20); 5726 5727 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5728 m_count = _mm_cvtsi32_si128(i4_shift); 5729 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5730 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5731 5732 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5733 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5734 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5735 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5736 5737 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5738 5739 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5740 pi2_dst_scratch += out_stride; 5741 5742 } 5743 5744 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); 5745 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]); 5746 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]); 5747 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]); 5748 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]); 5749 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]); 5750 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]); 5751 m_coeff8 = 
_mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]); 5752 5753 /* o4[0-3] */ 5754 { 5755 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5756 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5757 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5758 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5759 5760 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 5761 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 5762 5763 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 5764 5765 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 5766 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 5767 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 5768 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 5769 5770 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 5771 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 5772 5773 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 5774 5775 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 5776 5777 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20); 5778 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20); 5779 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5780 m_count = _mm_cvtsi32_si128(i4_shift); 5781 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5782 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5783 5784 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5785 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5786 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5787 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5788 5789 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5790 5791 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5792 pi2_dst_scratch += out_stride; 5793 5794 } 5795 5796 m_coeff1 = _mm_loadu_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); 5797 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]); 5798 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]); 5799 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]); 5800 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]); 5801 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]); 5802 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]); 5803 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]); 5804 5805 /* o5[0-3] */ 5806 { 5807 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5808 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5809 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5810 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5811 5812 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 5813 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 5814 5815 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 5816 5817 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 5818 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 5819 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 5820 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 5821 5822 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 5823 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 5824 5825 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 5826 5827 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 5828 5829 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20); 5830 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20); 5831 5832 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5833 m_count = _mm_cvtsi32_si128(i4_shift); 5834 m_rdng_factor = 
_mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5835 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5836 5837 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5838 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5839 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5840 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5841 5842 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5843 5844 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5845 pi2_dst_scratch += out_stride; 5846 5847 } 5848 5849 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); 5850 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]); 5851 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]); 5852 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]); 5853 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]); 5854 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]); 5855 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]); 5856 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]); 5857 5858 /* o6[0-3] */ 5859 { 5860 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5861 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5862 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5863 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5864 5865 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 5866 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 5867 5868 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 5869 5870 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 5871 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 5872 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 5873 m_temp_reg_43 = 
_mm_madd_epi16(m_temp_reg_17, m_coeff8); 5874 5875 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 5876 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 5877 5878 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 5879 5880 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 5881 5882 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20); 5883 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20); 5884 5885 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5886 m_count = _mm_cvtsi32_si128(i4_shift); 5887 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5888 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5889 5890 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5891 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5892 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5893 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5894 5895 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5896 5897 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5898 pi2_dst_scratch += out_stride; 5899 5900 } 5901 5902 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]); 5903 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]); 5904 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]); 5905 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]); 5906 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]); 5907 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]); 5908 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]); 5909 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]); 5910 5911 /* o7[0-3] */ 5912 { 5913 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5914 m_temp_reg_21 = 
_mm_madd_epi16(m_temp_reg_11, m_coeff2); 5915 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5916 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5917 5918 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 5919 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 5920 5921 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 5922 5923 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 5924 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 5925 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 5926 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 5927 5928 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 5929 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 5930 5931 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 5932 5933 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 5934 5935 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20); 5936 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20); 5937 5938 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5939 m_count = _mm_cvtsi32_si128(i4_shift); 5940 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5941 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5942 5943 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5944 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5945 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5946 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5947 5948 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5949 5950 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5951 pi2_dst_scratch += 8; 5952 5953 } 5954 5955 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]); 5956 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]); 5957 m_coeff3 = _mm_loadu_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]); 5958 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]); 5959 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]); 5960 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]); 5961 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]); 5962 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]); 5963 5964 /* o8[0-3] */ 5965 { 5966 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5967 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5968 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5969 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5970 5971 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 5972 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 5973 5974 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 5975 5976 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 5977 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 5978 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 5979 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 5980 5981 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 5982 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 5983 5984 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 5985 5986 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 5987 5988 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20); 5989 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20); 5990 5991 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5992 m_count = _mm_cvtsi32_si128(i4_shift); 5993 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5994 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5995 5996 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5997 m_temp_reg_30 
= _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5998 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5999 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 6000 6001 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 6002 6003 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 6004 pi2_dst_scratch += out_stride; 6005 } 6006 6007 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]); 6008 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]); 6009 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]); 6010 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]); 6011 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]); 6012 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]); 6013 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]); 6014 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]); 6015 6016 /* o9[0-3] */ 6017 { 6018 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 6019 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 6020 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 6021 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 6022 6023 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 6024 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 6025 6026 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 6027 6028 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 6029 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 6030 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 6031 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 6032 6033 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 6034 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 6035 6036 m_temp_reg_40 = 
_mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 6037 6038 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 6039 6040 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20); 6041 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20); 6042 6043 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 6044 m_count = _mm_cvtsi32_si128(i4_shift); 6045 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 6046 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 6047 6048 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 6049 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 6050 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 6051 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 6052 6053 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 6054 6055 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 6056 pi2_dst_scratch += out_stride; 6057 } 6058 6059 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]); 6060 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]); 6061 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]); 6062 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]); 6063 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]); 6064 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]); 6065 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]); 6066 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]); 6067 6068 /* o10[0-3] */ 6069 { 6070 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 6071 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 6072 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 6073 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 6074 6075 m_temp_reg_20 = 
_mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 6076 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 6077 6078 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 6079 6080 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 6081 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 6082 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 6083 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 6084 6085 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 6086 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 6087 6088 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 6089 6090 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 6091 6092 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20); 6093 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20); 6094 6095 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 6096 m_count = _mm_cvtsi32_si128(i4_shift); 6097 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 6098 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 6099 6100 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 6101 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 6102 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 6103 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 6104 6105 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 6106 6107 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 6108 pi2_dst_scratch += out_stride; 6109 } 6110 6111 6112 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]); 6113 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]); 6114 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]); 6115 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]); 6116 m_coeff5 = _mm_loadu_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]); 6117 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]); 6118 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]); 6119 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]); 6120 6121 /* o11[0-3] */ 6122 { 6123 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 6124 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 6125 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 6126 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 6127 6128 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 6129 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 6130 6131 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 6132 6133 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 6134 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 6135 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 6136 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 6137 6138 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 6139 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 6140 6141 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 6142 6143 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 6144 6145 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20); 6146 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20); 6147 6148 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 6149 m_count = _mm_cvtsi32_si128(i4_shift); 6150 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 6151 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 6152 6153 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 6154 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 6155 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 6156 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 6157 6158 
m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 6159 6160 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 6161 pi2_dst_scratch += out_stride; 6162 6163 } 6164 6165 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]); 6166 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]); 6167 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]); 6168 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]); 6169 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]); 6170 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]); 6171 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]); 6172 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]); 6173 6174 /* o12[0-3] */ 6175 { 6176 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 6177 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 6178 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 6179 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 6180 6181 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 6182 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 6183 6184 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 6185 6186 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 6187 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 6188 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 6189 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 6190 6191 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 6192 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 6193 6194 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 6195 6196 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 6197 6198 m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20); 6199 
m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20); 6200 6201 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 6202 m_count = _mm_cvtsi32_si128(i4_shift); 6203 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 6204 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 6205 6206 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 6207 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 6208 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 6209 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 6210 6211 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 6212 6213 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 6214 pi2_dst_scratch += out_stride; 6215 6216 } 6217 6218 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]); 6219 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]); 6220 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]); 6221 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]); 6222 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]); 6223 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]); 6224 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]); 6225 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]); 6226 6227 /* o13[0-3] */ 6228 { 6229 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 6230 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 6231 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 6232 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 6233 6234 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 6235 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 6236 6237 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 6238 6239 
m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 6240 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 6241 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 6242 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 6243 6244 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 6245 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 6246 6247 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 6248 6249 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 6250 6251 m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20); 6252 m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20); 6253 6254 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 6255 m_count = _mm_cvtsi32_si128(i4_shift); 6256 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 6257 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 6258 6259 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 6260 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 6261 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 6262 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 6263 6264 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 6265 6266 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 6267 pi2_dst_scratch += out_stride; 6268 } 6269 6270 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]); 6271 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]); 6272 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]); 6273 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]); 6274 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]); 6275 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]); 6276 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]); 6277 m_coeff8 = 
_mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]); 6278 6279 /* o14[0-3] */ 6280 { 6281 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 6282 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 6283 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 6284 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 6285 6286 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 6287 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 6288 6289 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 6290 6291 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 6292 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 6293 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 6294 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 6295 6296 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 6297 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 6298 6299 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 6300 6301 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 6302 6303 m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20); 6304 m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20); 6305 6306 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 6307 m_count = _mm_cvtsi32_si128(i4_shift); 6308 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 6309 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 6310 6311 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 6312 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 6313 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 6314 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 6315 6316 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 6317 6318 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 6319 pi2_dst_scratch += out_stride; 6320 6321 } 6322 6323 m_coeff1 = _mm_loadu_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]); 6324 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]); 6325 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]); 6326 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]); 6327 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]); 6328 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]); 6329 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]); 6330 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]); 6331 6332 /* o15[0-3] */ 6333 { 6334 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 6335 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 6336 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 6337 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 6338 6339 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 6340 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 6341 6342 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 6343 6344 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 6345 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 6346 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 6347 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 6348 6349 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 6350 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 6351 6352 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 6353 6354 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 6355 6356 m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20); 6357 m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20); 6358 6359 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 6360 m_count = _mm_cvtsi32_si128(i4_shift); 6361 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, 
m_rdng_factor); 6362 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 6363 6364 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 6365 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 6366 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 6367 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 6368 6369 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 6370 6371 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 6372 pi2_dst_scratch += 8; 6373 } 6374 6375 } 6376 } 6377 6378 /* Transpose */ 6379 { 6380 6381 WORD16 *pi2_src_scratch = temp_ptr; 6382 WORD32 out_stride = dst_strd; 6383 WORD32 in_stride = 8; 6384 6385 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 6386 pi2_src_scratch += in_stride; 6387 m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch); 6388 pi2_src_scratch += in_stride; 6389 m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch); 6390 pi2_src_scratch += in_stride; 6391 m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch); 6392 pi2_src_scratch += in_stride; 6393 m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch); 6394 pi2_src_scratch += in_stride; 6395 m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch); 6396 pi2_src_scratch += in_stride; 6397 m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch); 6398 pi2_src_scratch += in_stride; 6399 m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch); 6400 pi2_src_scratch += 8; 6401 6402 m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch); 6403 pi2_src_scratch += in_stride; 6404 m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch); 6405 pi2_src_scratch += in_stride; 6406 m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch); 6407 pi2_src_scratch += in_stride; 6408 m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch); 6409 pi2_src_scratch += in_stride; 6410 m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch); 6411 pi2_src_scratch += in_stride; 6412 
m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch); 6413 pi2_src_scratch += in_stride; 6414 m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch); 6415 pi2_src_scratch += in_stride; 6416 m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch); 6417 pi2_src_scratch += 8; 6418 6419 6420 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); 6421 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); 6422 6423 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); 6424 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); 6425 6426 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); 6427 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); 6428 6429 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); 6430 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); 6431 6432 m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); 6433 m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70); 6434 6435 m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); 6436 m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72); 6437 6438 m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); 6439 m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74); 6440 6441 m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); 6442 m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76); 6443 6444 6445 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); 6446 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); 6447 6448 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); 6449 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); 6450 6451 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82); 6452 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82); 6453 6454 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86); 6455 
m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86); 6456 6457 m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); 6458 m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); 6459 6460 m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); 6461 m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); 6462 6463 m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81); 6464 m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81); 6465 6466 m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85); 6467 m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85); 6468 6469 6470 m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2); // row0 = 0-7 6471 m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2); // row1 = 0-7 6472 6473 m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90); // row0=24-31 6474 m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90); // row1=24-31 6475 6476 m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6); // row0=8-15 6477 m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6); // row1=8-15 6478 6479 m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94); // row0=16-23 6480 m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94); // row1=16-23 6481 6482 m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3); // row2 =0-7 6483 m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3); // row3 =0-7 6484 6485 m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91); // row2=24-31 6486 m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91); // row3=24-31 6487 6488 m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7); // row2=8-15 6489 m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7); // row3=8-15 6490 6491 m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95); // row2=16-23 6492 m_temp_reg_87 = 
_mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95); // row3=16-23 6493 6494 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred); 6495 6496 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20); 6497 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 6498 6499 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_30, m_temp_reg_0); 6500 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 6501 6502 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0); 6503 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 6504 6505 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_34, m_temp_reg_0); 6506 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 6507 6508 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); 6509 6510 m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16)); 6511 6512 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20); 6513 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 6514 6515 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_36, m_temp_reg_0); 6516 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 6517 6518 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0); 6519 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 6520 6521 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_32, m_temp_reg_0); 6522 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 6523 6524 _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20); 6525 pu1_dst += out_stride; 6526 pu1_pred += pred_strd; 6527 6528 6529 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred); 6530 6531 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20); 6532 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 6533 6534 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_31, m_temp_reg_0); 6535 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 6536 6537 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0); 6538 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 6539 6540 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_35, m_temp_reg_0); 6541 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, 
m_temp_reg_44); 6542 6543 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); 6544 6545 m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16)); 6546 6547 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20); 6548 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 6549 6550 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_37, m_temp_reg_0); 6551 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 6552 6553 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0); 6554 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 6555 6556 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_33, m_temp_reg_0); 6557 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 6558 6559 _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20); 6560 pu1_dst += out_stride; 6561 pu1_pred += pred_strd; 6562 6563 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred); 6564 6565 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20); 6566 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 6567 6568 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_80, m_temp_reg_0); 6569 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 6570 6571 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0); 6572 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 6573 6574 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_84, m_temp_reg_0); 6575 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 6576 6577 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); 6578 6579 m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16)); 6580 6581 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20); 6582 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 6583 6584 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_86, m_temp_reg_0); 6585 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 6586 6587 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0); 6588 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 6589 6590 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_82, m_temp_reg_0); 6591 m_temp_reg_20 = 
_mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 6592 6593 _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20); 6594 pu1_dst += out_stride; 6595 pu1_pred += pred_strd; 6596 6597 6598 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred); 6599 6600 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20); 6601 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 6602 6603 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_81, m_temp_reg_0); 6604 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 6605 6606 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0); 6607 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 6608 6609 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_85, m_temp_reg_0); 6610 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 6611 6612 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); 6613 6614 m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16)); 6615 6616 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20); 6617 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 6618 6619 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_87, m_temp_reg_0); 6620 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 6621 6622 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0); 6623 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 6624 6625 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_83, m_temp_reg_0); 6626 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 6627 6628 _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20); 6629 pu1_dst += out_stride; 6630 pu1_pred += pred_strd; 6631 6632 } 6633 pi2_tmp += 4; 6634 } 6635 } 6636 6637