1 /****************************************************************************** 2 * 3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ******************************************************************************/ 18 /** 19 ******************************************************************************* 20 * @file 21 * ihevc_iquant_itrans_recon_atom_intr.c 22 * 23 * @brief 24 * Contains function definitions for inverse quantization, inverse 25 * transform and reconstruction 26 * 27 * @author 28 * 100470 29 * 100592 (edited by) 30 * 31 * @par List of Functions: 32 * - ihevc_itrans_recon_16x16_ssse3() 33 * 34 * @remarks 35 * None 36 * 37 ******************************************************************************* 38 */ 39 #include <stdio.h> 40 #include <string.h> 41 #include "ihevc_typedefs.h" 42 #include "ihevc_macros.h" 43 #include "ihevc_platform_macros.h" 44 #include "ihevc_defs.h" 45 #include "ihevc_trans_tables.h" 46 #include "ihevc_itrans_recon.h" 47 #include "ihevc_func_selector.h" 48 #include "ihevc_trans_macros.h" 49 50 51 52 #include <immintrin.h> 53 #include <emmintrin.h> 54 55 #include <tmmintrin.h> 56 57 58 /** 59 ******************************************************************************* 60 * 61 * @brief 62 * This function performs inverse transform and 63 * reconstruction for 16x16 input block 64 * 65 * @par Description: 66 * Performs 
inverse transform, adds the 67 * prediction data and clips output to 8 bit 68 * 69 * @param[in] pi2_src 70 * Input 16x16 coefficients 71 * 72 * @param[in] pi2_tmp 73 * Temporary 16x16 buffer for storing inverse 74 * transform 1st stage output 75 * 76 * @param[in] pu1_pred 77 * Prediction 16x16 block 78 * 79 * @param[out] pu1_dst 80 * Output 16x16 block 81 * 82 * @param[in] src_strd 83 * Input stride 84 * 85 * @param[in] pred_strd 86 * Prediction stride 87 * 88 * @param[in] dst_strd 89 * Output Stride 90 * 91 * @param[in] zero_cols 92 * Zero columns in pi2_src, as a bit mask. When bits 93 * 8-15 are all set, the last 8 columns are skipped 94 * in stage 1. Also used to derive the zero-row 95 * flags for stage 2 96 * 97 * @param[in] zero_rows 98 * Zero rows in pi2_src, as a bit mask. When bits 99 * 8-15 (last 8 rows) or bits 4-15 (last 12 rows) 100 * are all set, a reduced computation path is 101 * taken in stage 1 102 * 103 * @returns Void 104 * 105 * @remarks 106 * None 107 * 108 ******************************************************************************* 109 */ 110 111 void ihevc_itrans_recon_16x16_ssse3(WORD16 *pi2_src, 112 WORD16 *pi2_tmp, 113 UWORD8 *pu1_pred, 114 UWORD8 *pu1_dst, 115 WORD32 src_strd, 116 WORD32 pred_strd, 117 WORD32 dst_strd, 118 WORD32 zero_cols, 119 WORD32 zero_rows) 120 { 121 __m128i m_temp_reg_0; 122 __m128i m_temp_reg_1; 123 __m128i m_temp_reg_10; 124 __m128i m_temp_reg_11; 125 __m128i m_temp_reg_12; 126 __m128i m_temp_reg_13; 127 __m128i m_temp_reg_14; 128 129 __m128i m_temp_reg_20; 130 __m128i m_temp_reg_21; 131 __m128i m_temp_reg_22; 132 __m128i m_temp_reg_23; 133 __m128i m_temp_reg_24; 134 __m128i m_temp_reg_25; 135 __m128i m_temp_reg_26; 136 __m128i m_temp_reg_27; 137 __m128i m_temp_reg_30; 138 __m128i m_temp_reg_31; 139 __m128i m_temp_reg_32; 140 __m128i m_temp_reg_33; 141 __m128i m_temp_reg_34; 142 __m128i m_temp_reg_35; 143 __m128i m_temp_reg_36; 144 __m128i m_temp_reg_37; 145 __m128i m_temp_reg_40; 146 __m128i m_temp_reg_41; 147 __m128i m_temp_reg_42; 148 __m128i m_temp_reg_43; 149 __m128i m_temp_reg_44; 150 __m128i m_temp_reg_45; 151 __m128i m_temp_reg_46; 152 __m128i 
m_temp_reg_47; 153 154 __m128i m_temp_reg_70; 155 __m128i m_temp_reg_71; 156 __m128i m_temp_reg_72; 157 __m128i m_temp_reg_73; 158 __m128i m_temp_reg_74; 159 __m128i m_temp_reg_75; 160 __m128i m_temp_reg_76; 161 __m128i m_temp_reg_77; 162 __m128i m_rdng_factor; 163 __m128i m_count; 164 __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4; 165 __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8; 166 167 WORD32 i; 168 /*Lokesh*/ 169 WORD32 zero_last8_cols_stg1; 170 WORD32 zero_last8_rows_stg1; 171 WORD32 zero_last12_rows_stg1; 172 WORD32 zero_last12_rows_stg2; 173 WORD32 zero_last8_rows_stg2; 174 175 WORD32 loop = 0; 176 177 WORD32 i4_shift = IT_SHIFT_STAGE_1; 178 WORD32 trans_size = TRANS_SIZE_16; 179 180 181 182 183 /* Following 3 instructions replicates the value in the */ 184 /* lower 16 bits of m_add_iq in the entire register */ 185 186 /* Last 8 cols of 16x16 block are skipped based on the below flag : Lokesh */ 187 188 zero_last8_cols_stg1 = ((zero_cols & 0xFF00) == 0xFF00) ? 1 : 0; 189 zero_last8_rows_stg1 = ((zero_rows & 0xFF00) == 0xFF00) ? 1 : 0; 190 zero_last12_rows_stg1 = ((zero_rows & 0xFFF0) == 0xFFF0) ? 1 : 0; 191 192 zero_last12_rows_stg2 = ((zero_cols & 0xFFF0) == 0xFFF0) ? 1 : 0; 193 zero_last8_rows_stg2 = zero_last8_cols_stg1; 194 if(zero_last8_cols_stg1) 195 { 196 loop = 1; 197 } 198 else 199 loop = 2; 200 201 /* i = 0 => lower 8 samples */ 202 /* i = 1 => higher 8 samples */ 203 for(i = 0; i < loop; i++) 204 { 205 { 206 WORD32 sample_half_index = i << 3; 207 WORD16 *pi2_tmp_src = pi2_src + sample_half_index; 208 WORD16 *pi2_scratch = (i) ? 
(pi2_tmp + 8 * trans_size) : pi2_tmp; 209 210 m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src); 211 pi2_tmp_src += (src_strd << 1); 212 m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src); 213 pi2_tmp_src += (src_strd << 1); 214 m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src); 215 pi2_tmp_src += (src_strd << 1); 216 m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src); 217 pi2_tmp_src += (src_strd << 1); 218 m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src); 219 pi2_tmp_src += (src_strd << 1); 220 m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src); 221 pi2_tmp_src += (src_strd << 1); 222 m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src); 223 pi2_tmp_src += (src_strd << 1); 224 m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src); 225 pi2_tmp_src += (src_strd << 1); 226 227 228 229 230 /* If last 12 rows are zero : Rishab */ 231 if(zero_last12_rows_stg1) 232 { 233 234 /* eee */ 235 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ 236 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ 237 { 238 /* Loading coeff and src for use in next block */ 239 240 m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get sign 241 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0 242 243 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6); 244 245 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77); 246 247 m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6); 248 249 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 250 251 m_temp_reg_26 = m_temp_reg_24; 252 m_temp_reg_27 = m_temp_reg_25; 253 } 254 255 /* eo */ 256 257 /* eo0[0-3] */ 258 { 259 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 260 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 261 262 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 263 264 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 265 266 /* e[0][0-3] stored in pi2_tmp[0][0-7] */ 267 
/* e[7][0-3] stored in pi2_tmp[0][8-15] */ 268 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30); 269 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30); 270 271 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 272 pi2_scratch += 8; 273 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 274 pi2_scratch += 8; 275 276 } 277 278 279 /* eo0[4-7] */ 280 { 281 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 282 283 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 284 285 /* e[0][4-7] stored in pi2_tmp[1][0-7] */ 286 /* e[7][4-7] stored in pi2_tmp[1][8-15] */ 287 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31); 288 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31); 289 290 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 291 pi2_scratch += 8; 292 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 293 pi2_scratch += 8; 294 295 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 296 } 297 298 /* eo1[0-3] */ 299 { 300 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 301 302 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 303 304 /* e[1][0-3] stored in pi2_tmp[2][0-7] */ 305 /* e[6][0-3] stored in pi2_tmp[2][8-15] */ 306 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30); 307 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30); 308 309 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 310 pi2_scratch += 8; 311 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 312 pi2_scratch += 8; 313 } 314 315 /* eo1[4-7] */ 316 { 317 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 318 319 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 320 321 /* e[1][4-7] stored in pi2_tmp[3][0-7] */ 322 /* e[6][4-7] stored in pi2_tmp[3][8-15] */ 323 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31); 324 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31); 325 326 
_mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 327 pi2_scratch += 8; 328 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 329 pi2_scratch += 8; 330 331 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 332 333 } 334 335 /* eo2[0-3] */ 336 { 337 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 338 339 /* e[2][0-3] stored in pi2_tmp[4][0-7] */ 340 /* e[5][0-3] stored in pi2_tmp[4][8-15] */ 341 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30); 342 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30); 343 344 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 345 pi2_scratch += 8; 346 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 347 pi2_scratch += 8; 348 349 } 350 351 /* eo2[4-7] */ 352 { 353 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 354 355 /* e[2][4-7] stored in pi2_tmp[5][0-7] */ 356 /* e[5][4-7] stored in pi2_tmp[5][8-15] */ 357 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31); 358 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31); 359 360 361 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 362 pi2_scratch += 8; 363 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 364 pi2_scratch += 8; 365 366 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 367 } 368 369 /* eo3[0-3] */ 370 { 371 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 372 373 /* e[3][0-3] stored in pi2_tmp[6][0-7] */ 374 /* e[4][0-3] stored in pi2_tmp[6][8-15] */ 375 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30); 376 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30); 377 378 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 379 pi2_scratch += 8; 380 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 381 pi2_scratch += 8; 382 } 383 384 /* eo3[4-7] */ 385 { 386 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 387 388 /* e[3][4-7] stored in pi2_tmp[7][0-7] */ 389 /* e[4][4-7] stored in 
pi2_tmp[7][8-15] */ 390 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31); 391 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31); 392 393 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 394 pi2_scratch += 8; 395 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 396 pi2_scratch += 8; 397 } 398 } 399 /* If last 8 rows are zero : Rishab */ 400 else if(zero_last8_rows_stg1) 401 { 402 /* eeo */ 403 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 404 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 405 { 406 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36 407 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83 408 409 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's 410 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's 411 412 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 413 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 414 415 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 416 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 417 418 } 419 420 /* eee */ 421 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ 422 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ 423 { 424 /* Loading coeff and src for use in next block */ 425 m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get signs 426 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0 427 428 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6); 429 430 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77); 431 432 m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6); 433 434 m_temp_reg_26 = m_temp_reg_24; 435 m_temp_reg_27 = m_temp_reg_25; 436 437 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 438 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18 439 } 440 441 /* 
eo0[0-3] */ 442 { 443 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 444 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 445 446 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 447 448 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 449 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20); 450 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20); 451 452 /* e[0][0-3] stored in pi2_tmp[0][0-7] */ 453 /* e[7][0-3] stored in pi2_tmp[0][8-15] */ 454 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 455 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 456 457 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 458 pi2_scratch += 8; 459 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 460 pi2_scratch += 8; 461 462 } 463 464 /* eo0[4-7] */ 465 { 466 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 467 468 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 469 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21); 470 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21); 471 472 /* e[0][4-7] stored in pi2_tmp[1][0-7] */ 473 /* e[7][4-7] stored in pi2_tmp[1][8-15] */ 474 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 475 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 476 477 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 478 pi2_scratch += 8; 479 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 480 pi2_scratch += 8; 481 482 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 483 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 484 485 } 486 487 /* eo1[0-3] */ 488 { 489 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 490 491 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 492 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22); 493 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, 
m_temp_reg_22); 494 495 /* e[1][0-3] stored in pi2_tmp[2][0-7] */ 496 /* e[6][0-3] stored in pi2_tmp[2][8-15] */ 497 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 498 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 499 500 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 501 pi2_scratch += 8; 502 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 503 pi2_scratch += 8; 504 505 } 506 507 /* eo1[4-7] */ 508 { 509 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 510 511 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 512 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23); 513 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23); 514 515 /* e[1][4-7] stored in pi2_tmp[3][0-7] */ 516 /* e[6][4-7] stored in pi2_tmp[3][8-15] */ 517 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31); 518 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31); 519 520 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 521 pi2_scratch += 8; 522 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 523 pi2_scratch += 8; 524 525 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 526 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75 527 528 } 529 530 /* eo2[0-3] */ 531 { 532 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 533 534 /* e[2][0-3] stored in pi2_tmp[4][0-7] */ 535 /* e[5][0-3] stored in pi2_tmp[4][8-15] */ 536 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 537 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 538 539 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 540 pi2_scratch += 8; 541 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 542 pi2_scratch += 8; 543 544 } 545 546 /* eo2[4-7] */ 547 { 548 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 549 550 /* e[2][4-7] stored in pi2_tmp[5][0-7] */ 551 /* e[5][4-7] stored in pi2_tmp[5][8-15] */ 552 m_temp_reg_34 = 
_mm_add_epi32(m_temp_reg_45, m_temp_reg_31); 553 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31); 554 555 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 556 pi2_scratch += 8; 557 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 558 pi2_scratch += 8; 559 560 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 561 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 562 563 } 564 565 /* eo3[0-3] */ 566 { 567 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 568 569 /* e[3][0-3] stored in pi2_tmp[6][0-7] */ 570 /* e[4][0-3] stored in pi2_tmp[6][8-15] */ 571 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 572 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 573 574 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 575 pi2_scratch += 8; 576 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 577 pi2_scratch += 8; 578 } 579 580 /* eo3[4-7] */ 581 { 582 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 583 584 /* e[3][4-7] stored in pi2_tmp[7][0-7] */ 585 /* e[4][4-7] stored in pi2_tmp[7][8-15] */ 586 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31); 587 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31); 588 589 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 590 pi2_scratch += 8; 591 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 592 pi2_scratch += 8; 593 } 594 } /* If all the rows are non-zero : Rishab */ 595 else 596 { 597 /* eeo */ 598 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 599 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 600 601 { 602 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36 603 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83 604 605 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's 606 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); 
//row 4 and row 12 interleaved MSB's 607 608 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 609 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 610 611 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 612 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 613 } 614 615 /* eee */ 616 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ 617 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ 618 { 619 /* Loading coeff and src for use in next block */ 620 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64 64 621 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64 622 623 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's 624 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's 625 626 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); 627 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4); 628 629 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 630 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); 631 632 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 633 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18 634 635 } 636 /* eo0[0-3] */ 637 { 638 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 639 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 640 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); 641 m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77); 642 643 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 644 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2); 645 646 647 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 648 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20); 649 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20); 650 651 /* e[0][0-3] stored in pi2_tmp[0][0-7] */ 652 /* 
e[7][0-3] stored in pi2_tmp[0][8-15] */ 653 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 654 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 655 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); 656 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); 657 658 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 659 pi2_scratch += 8; 660 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 661 pi2_scratch += 8; 662 663 664 } 665 666 /* eo0[4-7] */ 667 { 668 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 669 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2); 670 671 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 672 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21); 673 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21); 674 675 /* e[0][4-7] stored in pi2_tmp[1][0-7] */ 676 /* e[7][4-7] stored in pi2_tmp[1][8-15] */ 677 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 678 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 679 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); 680 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); 681 682 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 683 pi2_scratch += 8; 684 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 685 pi2_scratch += 8; 686 687 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 688 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 689 690 } 691 692 /* eo1[0-3] */ 693 { 694 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 695 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4); 696 697 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 698 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22); 699 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22); 700 701 /* e[1][0-3] stored in pi2_tmp[2][0-7] */ 702 /* e[6][0-3] stored in pi2_tmp[2][8-15] */ 
703 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 704 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 705 m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32); 706 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32); 707 708 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 709 pi2_scratch += 8; 710 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 711 pi2_scratch += 8; 712 713 } 714 715 /* eo1[4-7] */ 716 { 717 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 718 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 719 720 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 721 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23); 722 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23); 723 724 /* e[1][4-7] stored in pi2_tmp[3][0-7] */ 725 /* e[6][4-7] stored in pi2_tmp[3][8-15] */ 726 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31); 727 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31); 728 m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33); 729 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33); 730 731 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 732 pi2_scratch += 8; 733 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 734 pi2_scratch += 8; 735 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 736 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75 737 } 738 739 /* eo2[0-3] */ 740 { 741 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 742 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2); 743 744 /* e[2][0-3] stored in pi2_tmp[4][0-7] */ 745 /* e[5][0-3] stored in pi2_tmp[4][8-15] */ 746 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 747 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 748 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); 749 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, 
m_temp_reg_32); 750 751 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 752 pi2_scratch += 8; 753 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 754 pi2_scratch += 8; 755 } 756 757 /* eo2[4-7] */ 758 { 759 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 760 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2); 761 762 /* e[2][4-7] stored in pi2_tmp[5][0-7] */ 763 /* e[5][4-7] stored in pi2_tmp[5][8-15] */ 764 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31); 765 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31); 766 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); 767 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); 768 769 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 770 pi2_scratch += 8; 771 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 772 pi2_scratch += 8; 773 774 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 775 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 776 777 } 778 779 /* eo3[0-3] */ 780 { 781 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 782 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4); 783 784 /* e[3][0-3] stored in pi2_tmp[6][0-7] */ 785 /* e[4][0-3] stored in pi2_tmp[6][8-15] */ 786 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 787 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 788 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); 789 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); 790 791 792 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 793 pi2_scratch += 8; 794 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 795 pi2_scratch += 8; 796 } 797 798 /* eo3[4-7] */ 799 { 800 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 801 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 802 803 /* e[3][4-7] stored in pi2_tmp[7][0-7] */ 804 /* e[4][4-7] stored in pi2_tmp[7][8-15] */ 805 
m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31); 806 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31); 807 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); 808 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); 809 810 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 811 pi2_scratch += 8; 812 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 813 pi2_scratch += 8; 814 } 815 816 } 817 } 818 819 { 820 WORD32 sample_half_index = i << 3; 821 WORD16 *pi2_tmp_src = pi2_src + sample_half_index + src_strd; 822 823 m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src); 824 pi2_tmp_src += (src_strd << 1); 825 m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src); 826 pi2_tmp_src += (src_strd << 1); 827 m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src); 828 pi2_tmp_src += (src_strd << 1); 829 m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src); 830 pi2_tmp_src += (src_strd << 1); 831 m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src); 832 pi2_tmp_src += (src_strd << 1); 833 m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src); 834 pi2_tmp_src += (src_strd << 1); 835 m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src); 836 pi2_tmp_src += (src_strd << 1); 837 m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src); 838 pi2_tmp_src += (src_strd << 1); 839 } 840 841 /* o & stage 1 out */ 842 { 843 WORD32 j; 844 WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp; 845 WORD16 *pi2_dst_scratch = (i) ? 
(pi2_tmp + 8 * trans_size) : pi2_tmp; 846 WORD32 out_stride = (trans_size << 1); 847 WORD32 in_stride = trans_size << 1; 848 849 if(zero_last12_rows_stg1) 850 { 851 for(j = 0; j < 2; j++) 852 { 853 if(j) //H8B= higher 8 bytes L8B lower 8 bytes 854 { 855 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B 856 } 857 else 858 { 859 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B 860 } 861 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 862 863 864 /* o0[0-3] */ 865 { 866 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 867 868 869 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 870 pi2_src_scratch += in_stride; 871 872 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 873 874 875 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 876 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 877 878 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 879 m_count = _mm_cvtsi32_si128(i4_shift); 880 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); 881 882 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 883 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 884 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 885 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 886 887 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 888 889 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 890 pi2_dst_scratch += out_stride; 891 } 892 893 /* o1[0-3] */ 894 { 895 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 896 897 898 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 899 pi2_src_scratch += in_stride; 900 901 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 902 903 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 904 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 905 906 
m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 907 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 908 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 909 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 910 911 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 912 913 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 914 pi2_dst_scratch += out_stride; 915 } 916 917 /* o2[0-3] */ 918 { 919 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 920 921 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 922 pi2_src_scratch += in_stride; 923 924 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 925 926 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 927 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 928 929 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 930 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 931 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 932 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 933 934 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 935 936 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 937 pi2_dst_scratch += out_stride; 938 } 939 940 /* o3[0-3] */ 941 { 942 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 943 944 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 945 pi2_src_scratch += 8; 946 947 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 948 949 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 950 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 951 952 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 953 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 954 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 955 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 956 957 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 
958 959 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 960 pi2_dst_scratch += 8; 961 } 962 963 /* o4[0-3] */ 964 { 965 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 966 967 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 968 pi2_src_scratch -= in_stride; 969 970 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 971 972 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 973 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 974 975 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 976 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 977 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 978 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 979 980 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 981 982 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 983 pi2_dst_scratch -= out_stride; 984 } 985 986 /* o5[0-3] */ 987 { 988 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 989 990 991 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 992 pi2_src_scratch -= in_stride; 993 994 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 995 996 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 997 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 998 999 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1000 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1001 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1002 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1003 1004 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1005 1006 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1007 pi2_dst_scratch -= out_stride; 1008 } 1009 1010 /* o6[0-3] */ 1011 { 1012 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1013 1014 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 1015 pi2_src_scratch -= 
in_stride; 1016 1017 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 1018 1019 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1020 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1021 1022 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1023 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1024 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1025 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1026 1027 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1028 1029 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1030 pi2_dst_scratch -= out_stride; 1031 } 1032 1033 /* o7[0-3] */ 1034 { 1035 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 1036 1037 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 1038 pi2_src_scratch += 8; 1039 1040 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 1041 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 1042 1043 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1044 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1045 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1046 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1047 1048 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1049 1050 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1051 pi2_dst_scratch += 8; 1052 } 1053 } 1054 } 1055 else if(zero_last8_rows_stg1) 1056 { 1057 for(j = 0; j < 2; j++) 1058 { 1059 if(j) 1060 { 1061 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B 1062 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B 1063 } 1064 else 1065 { 1066 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B 1067 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B 1068 } 1069 m_coeff1 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 1070 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70 1071 1072 /* o0[0-3] */ 1073 { 1074 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1075 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1076 1077 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 1078 pi2_src_scratch += in_stride; 1079 1080 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 1081 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43 1082 1083 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 1084 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1085 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1086 1087 1088 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1089 m_count = _mm_cvtsi32_si128(i4_shift); 1090 1091 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); 1092 1093 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1094 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1095 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1096 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1097 1098 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1099 1100 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1101 pi2_dst_scratch += out_stride; 1102 } 1103 1104 /* o1[0-3] */ 1105 { 1106 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 1107 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 1108 1109 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 1110 pi2_src_scratch += in_stride; 1111 1112 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 1113 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87 1114 1115 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 1116 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 1117 m_temp_reg_30 = 
_mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 1118 1119 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1120 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1121 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1122 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1123 1124 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1125 1126 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1127 pi2_dst_scratch += out_stride; 1128 } 1129 1130 /* o2[0-3] */ 1131 { 1132 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1133 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1134 1135 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 1136 pi2_src_scratch += in_stride; 1137 1138 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 1139 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9 1140 1141 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 1142 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1143 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1144 1145 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1146 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1147 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1148 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1149 1150 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1151 1152 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1153 pi2_dst_scratch += out_stride; 1154 } 1155 1156 /* o3[0-3] */ 1157 { 1158 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 1159 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 1160 1161 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 1162 pi2_src_scratch += 8; 1163 1164 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 1165 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90 
1166 1167 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25); 1168 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 1169 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 1170 1171 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1172 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1173 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1174 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1175 1176 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1177 1178 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1179 pi2_dst_scratch += 8; 1180 } 1181 1182 /* o4[0-3] */ 1183 { 1184 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1185 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1186 1187 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 1188 pi2_src_scratch -= in_stride; 1189 1190 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 1191 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25 1192 1193 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 1194 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1195 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1196 1197 1198 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1199 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1200 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1201 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1202 1203 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1204 1205 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1206 pi2_dst_scratch -= out_stride; 1207 } 1208 1209 /* o5[0-3] */ 1210 { 1211 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 1212 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 1213 1214 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 1215 pi2_src_scratch -= in_stride; 1216 1217 
m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 1218 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80 1219 1220 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 1221 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 1222 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 1223 1224 1225 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1226 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1227 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1228 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1229 1230 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1231 1232 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1233 pi2_dst_scratch -= out_stride; 1234 } 1235 1236 /* o6[0-3] */ 1237 { 1238 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1239 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1240 1241 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 1242 pi2_src_scratch -= in_stride; 1243 1244 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 1245 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57 1246 1247 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 1248 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1249 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1250 1251 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1252 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1253 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1254 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1255 1256 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1257 1258 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1259 pi2_dst_scratch -= out_stride; 1260 } 1261 1262 /* o7[0-3] */ 1263 { 1264 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 1265 
m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 1266 1267 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 1268 pi2_src_scratch += 8; 1269 1270 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 1271 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 1272 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 1273 1274 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1275 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1276 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1277 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1278 1279 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1280 1281 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1282 pi2_dst_scratch += 8; 1283 } 1284 } 1285 1286 } 1287 else 1288 { 1289 1290 1291 1292 for(j = 0; j < 2; j++) 1293 { 1294 if(j) //H8B= higher 8 bytes L8B lower 8 bytes 1295 { 1296 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B 1297 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B 1298 m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B 1299 m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B 1300 } 1301 else 1302 { 1303 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B 1304 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B 1305 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B 1306 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B 1307 } 1308 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 1309 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70 1310 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43 1311 m_coeff4 = 
_mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25 9 1312 1313 1314 /* o0[0-3] */ 1315 { 1316 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1317 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1318 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1319 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1320 1321 1322 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 1323 pi2_src_scratch += in_stride; 1324 1325 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 1326 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43 1327 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90 1328 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25 1329 1330 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 1331 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 1332 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 1333 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1334 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1335 1336 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1337 m_count = _mm_cvtsi32_si128(i4_shift); 1338 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 1339 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 1340 1341 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1342 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1343 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1344 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1345 1346 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1347 1348 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1349 pi2_dst_scratch += out_stride; 1350 } 1351 1352 /* o1[0-3] */ 1353 { 1354 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 1355 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 1356 
m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 1357 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 1358 1359 1360 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 1361 pi2_src_scratch += in_stride; 1362 1363 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 1364 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87 1365 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57 1366 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43 1367 1368 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 1369 m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27); 1370 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26); 1371 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 1372 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 1373 1374 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1375 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1376 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1377 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1378 1379 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1380 1381 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1382 pi2_dst_scratch += out_stride; 1383 } 1384 1385 /* o2[0-3] */ 1386 { 1387 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1388 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1389 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1390 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1391 1392 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 1393 pi2_src_scratch += in_stride; 1394 1395 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 1396 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9 1397 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25 1398 
m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57 1399 1400 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 1401 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); 1402 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22); 1403 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1404 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1405 1406 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1407 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1408 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1409 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1410 1411 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1412 1413 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1414 pi2_dst_scratch += out_stride; 1415 } 1416 1417 /* o3[0-3] */ 1418 { 1419 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 1420 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 1421 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 1422 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 1423 1424 1425 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 1426 pi2_src_scratch += 8; 1427 1428 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 1429 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90 1430 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87 1431 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70 1432 1433 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25); 1434 m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27); 1435 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26); 1436 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 1437 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 1438 1439 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1440 
m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1441 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1442 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1443 1444 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1445 1446 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1447 pi2_dst_scratch += 8; 1448 } 1449 1450 /* o4[0-3] */ 1451 { 1452 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1453 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1454 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1455 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1456 1457 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1458 pi2_src_scratch -= in_stride; 1459 1460 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 1461 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25 1462 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70 1463 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80 1464 1465 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 1466 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); 1467 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22); 1468 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1469 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1470 1471 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1472 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1473 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1474 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1475 1476 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1477 1478 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1479 pi2_dst_scratch -= out_stride; 1480 } 1481 1482 /* o5[0-3] */ 1483 { 1484 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 1485 m_temp_reg_25 = 
_mm_madd_epi16(m_temp_reg_11, m_coeff6); 1486 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 1487 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 1488 1489 1490 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1491 pi2_src_scratch -= in_stride; 1492 1493 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 1494 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80 1495 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9 1496 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87 1497 1498 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 1499 m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27); 1500 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26); 1501 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 1502 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 1503 1504 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1505 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1506 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1507 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1508 1509 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1510 1511 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1512 pi2_dst_scratch -= out_stride; 1513 } 1514 1515 /* o6[0-3] */ 1516 { 1517 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1518 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1519 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1520 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1521 1522 1523 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1524 pi2_src_scratch -= in_stride; 1525 1526 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 1527 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57 1528 m_coeff7 = 
_mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80 1529 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90 1530 1531 1532 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 1533 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); 1534 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 1535 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1536 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1537 1538 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1539 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1540 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1541 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1542 1543 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1544 1545 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1546 pi2_dst_scratch -= out_stride; 1547 } 1548 1549 /* o7[0-3] */ 1550 { 1551 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 1552 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 1553 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 1554 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 1555 1556 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1557 pi2_src_scratch += 8; 1558 1559 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 1560 m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27); 1561 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26); 1562 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 1563 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 1564 1565 1566 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1567 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1568 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1569 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1570 1571 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1572 1573 
_mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1574 pi2_dst_scratch += 8; 1575 } 1576 } 1577 } 1578 } 1579 1580 /* Transpose */ 1581 { 1582 WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp; 1583 WORD16 *pi2_dst_scratch = ((i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp); 1584 WORD32 out_stride = (trans_size << 1); 1585 WORD32 in_stride = (trans_size << 1); 1586 WORD32 j; 1587 1588 for(j = 0; j < 2; j++) 1589 { 1590 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a 1591 pi2_src_scratch += in_stride; 1592 m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c 1593 pi2_src_scratch += in_stride; 1594 m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e 1595 pi2_src_scratch += in_stride; 1596 m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g 1597 pi2_src_scratch += 8; 1598 m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i 1599 pi2_src_scratch -= in_stride; 1600 m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k 1601 pi2_src_scratch -= in_stride; 1602 m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m 1603 pi2_src_scratch -= in_stride; 1604 m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o 1605 pi2_src_scratch += 8; 1606 1607 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0 1608 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0 1609 1610 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0 1611 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0 1612 1613 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0 1614 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0 1615 1616 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0 1617 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0 
1618 1619 1620 m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0 1621 m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2 1622 1623 m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0 1624 m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2 1625 1626 m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0 1627 m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2 1628 1629 m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0 1630 m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2 1631 1632 1633 m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0 1634 m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1 1635 1636 m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2 1637 m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3 1638 1639 m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0 1640 m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1 1641 1642 m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2 1643 m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3 1644 1645 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 1646 pi2_dst_scratch += out_stride; 1647 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_44); 1648 pi2_dst_scratch += out_stride; 1649 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_41); 1650 pi2_dst_scratch += out_stride; 1651 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_45); 1652 pi2_dst_scratch += 8; 1653 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_42); 1654 pi2_dst_scratch -= out_stride; 1655 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_46); 1656 pi2_dst_scratch -= 
out_stride; 1657 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_43); 1658 pi2_dst_scratch -= out_stride; 1659 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_47); 1660 pi2_dst_scratch += 8; 1661 } 1662 } 1663 } 1664 1665 if(zero_last8_cols_stg1) 1666 { 1667 WORD16 *pi2_dst_scratch = (pi2_tmp + 8 * trans_size); 1668 WORD32 out_stride = (trans_size << 1); 1669 WORD32 j; 1670 1671 m_temp_reg_40 = _mm_setzero_si128(); 1672 for(j = 0; j < 2; j++) 1673 { 1674 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 1675 pi2_dst_scratch += out_stride; 1676 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 1677 pi2_dst_scratch += out_stride; 1678 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 1679 pi2_dst_scratch += out_stride; 1680 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 1681 pi2_dst_scratch += 8; 1682 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 1683 pi2_dst_scratch -= out_stride; 1684 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 1685 pi2_dst_scratch -= out_stride; 1686 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 1687 pi2_dst_scratch -= out_stride; 1688 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 1689 pi2_dst_scratch += 8; 1690 } 1691 } 1692 1693 1694 1695 1696 /* Stage 2 */ 1697 for(i = 0; i < 2; i++) 1698 { 1699 WORD16 *pi2_src_temp = (i) ? 
(pi2_tmp + 2 * trans_size) : (WORD16 *)(pi2_tmp); 1700 WORD32 stride = (trans_size); 1701 MEM_ALIGN16 WORD16 temp_array[256]; 1702 1703 i4_shift = IT_SHIFT_STAGE_2; 1704 1705 if(zero_last12_rows_stg2) 1706 { 1707 /* eeo */ 1708 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 1709 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 1710 { 1711 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0 1712 1713 pi2_src_temp += (stride * 9); 1714 1715 if(!i) 1716 { 1717 pi2_src_temp += (stride * 6 + 8); 1718 } 1719 else 1720 { 1721 pi2_src_temp += (stride * 2 + 8); 1722 } 1723 1724 pi2_src_temp -= (stride * 9); 1725 1726 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2 1727 1728 m_temp_reg_20 = _mm_setzero_si128(); 1729 m_temp_reg_22 = _mm_setzero_si128(); 1730 1731 m_temp_reg_21 = _mm_setzero_si128(); 1732 m_temp_reg_23 = _mm_setzero_si128(); 1733 } 1734 1735 /* eee */ 1736 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ 1737 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ 1738 { 1739 /* Loading coeff and src for use in next block */ 1740 1741 /* Loading coeff and src for use in next block */ 1742 m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_20, m_temp_reg_70); 1743 1744 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0 1745 1746 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6); 1747 1748 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77); 1749 m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6); 1750 1751 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 1752 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18 1753 1754 m_temp_reg_26 = m_temp_reg_24; 1755 m_temp_reg_27 = m_temp_reg_25; 1756 /* */ 1757 1758 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_20); 1759 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_20); 1760 } 1761 1762 /* eo */ 1763 { 1764 WORD16 *pi2_scratch = temp_array; 1765 WORD32 out_stride = 8; 
1766 1767 1768 /* eo0[0-3] */ 1769 { 1770 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1771 1772 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 1773 1774 /* e[0][0-3] stored in pu1_dst[0] */ 1775 /* e[7][0-3] stored in pu1_dst[1] */ 1776 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30); 1777 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30); 1778 1779 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1780 pi2_scratch += out_stride; 1781 _mm_store_si128((__m128i *)(pi2_scratch), m_temp_reg_35); 1782 pi2_scratch += out_stride; 1783 } 1784 1785 /* eo0[4-7] */ 1786 { 1787 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1788 1789 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 1790 1791 /* e[0][4-7] stored in pu1_dst[2] */ 1792 /* e[7][4-7] stored in pu1_dst[3] */ 1793 1794 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31); 1795 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31); 1796 1797 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1798 pi2_scratch += out_stride; 1799 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1800 pi2_scratch += out_stride; 1801 1802 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 1803 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 1804 1805 } 1806 1807 /* eo1[0-3] */ 1808 { 1809 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 1810 1811 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 1812 1813 /* e[1][0-3] stored in pu1_dst[4] */ 1814 /* e[6][0-3] stored in pu1_dst[5] */ 1815 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30); 1816 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30); 1817 1818 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1819 pi2_scratch += out_stride; 1820 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1821 pi2_scratch += out_stride; 1822 } 1823 1824 /* eo1[4-7] */ 1825 { 
1826 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 1827 1828 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 1829 1830 /* e[1][4-7] stored in pu1_dst[6]*/ 1831 /* e[6][4-7] stored in pu1_dst[7] */ 1832 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31); 1833 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31); 1834 1835 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1836 pi2_scratch += out_stride; 1837 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1838 pi2_scratch += out_stride; 1839 1840 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 1841 1842 } 1843 1844 /* eo2[0-3] */ 1845 { 1846 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1847 1848 /* e[2][0-3] stored in pu1_dst[8]*/ 1849 /* e[5][0-3] stored in pu1_dst[9] */ 1850 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30); 1851 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30); 1852 1853 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1854 pi2_scratch += out_stride; 1855 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1856 pi2_scratch += out_stride; 1857 } 1858 1859 /* eo2[4-7] */ 1860 { 1861 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1862 1863 /* e[2][4-7] stored in pu1_dst[10]*/ 1864 /* e[5][4-7] stored in pu1_dst[11] */ 1865 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31); 1866 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31); 1867 1868 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1869 pi2_scratch += out_stride; 1870 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1871 pi2_scratch += out_stride; 1872 1873 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 1874 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 1875 1876 } 1877 1878 /* eo3[0-3] */ 1879 { 1880 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 1881 1882 /* e[3][0-3] stored in 
pu1_dst[12]*/ 1883 /* e[4][0-3] stored in pu1_dst[13] */ 1884 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30); 1885 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30); 1886 1887 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1888 pi2_scratch += out_stride; 1889 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1890 pi2_scratch += out_stride; 1891 } 1892 1893 /* eo3[4-7] */ 1894 { 1895 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 1896 1897 /* e[3][4-7] stored in pu1_dst[14]*/ 1898 /* e[4][4-7] stored in pu1_dst[15] */ 1899 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31); 1900 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31); 1901 1902 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1903 pi2_scratch += out_stride; 1904 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1905 pi2_scratch += out_stride; 1906 } 1907 1908 } 1909 } 1910 else if(zero_last8_rows_stg2) 1911 { 1912 /* eeo */ 1913 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 1914 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 1915 { 1916 1917 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[3][0]); //83 1918 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[4][0]); //36 1919 1920 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0 1921 pi2_src_temp += (stride); 1922 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4 1923 pi2_src_temp += (stride * 8); 1924 1925 if(!i) 1926 { 1927 pi2_src_temp += (stride * 6 + 8); 1928 } 1929 else 1930 { 1931 pi2_src_temp += (stride * 2 + 8); 1932 } 1933 1934 pi2_src_temp -= (stride * 8); 1935 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6 1936 pi2_src_temp -= (stride); 1937 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2 1938 1939 1940 m_temp_reg_76 = _mm_setzero_si128(); 1941 1942 1943 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36 1944 m_coeff2 = 
_mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83 1945 1946 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's 1947 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's 1948 1949 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 1950 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 1951 1952 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 1953 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 1954 } 1955 1956 /* eee */ 1957 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ 1958 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ 1959 { 1960 /* Loading coeff and src for use in next block */ 1961 1962 1963 m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_76, m_temp_reg_70); 1964 1965 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0 1966 1967 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6); 1968 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77); 1969 m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6); 1970 1971 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 1972 1973 m_temp_reg_26 = m_temp_reg_24; 1974 m_temp_reg_27 = m_temp_reg_25; 1975 1976 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 1977 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 1978 } 1979 1980 /* eo */ 1981 { 1982 WORD16 *pi2_scratch = temp_array; 1983 WORD32 out_stride = 8; 1984 1985 1986 /* eo0[0-3] */ 1987 { 1988 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1989 1990 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 1991 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20); 1992 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20); 1993 1994 /* e[0][0-3] stored in pu1_dst[0] */ 1995 /* e[7][0-3] stored in pu1_dst[1] */ 1996 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 1997 m_temp_reg_35 = 
_mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 1998 1999 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2000 pi2_scratch += out_stride; 2001 _mm_store_si128((__m128i *)(pi2_scratch), m_temp_reg_35); 2002 pi2_scratch += out_stride; 2003 } 2004 2005 /* eo0[4-7] */ 2006 { 2007 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 2008 2009 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 2010 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21); 2011 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21); 2012 2013 /* e[0][4-7] stored in pu1_dst[2] */ 2014 /* e[7][4-7] stored in pu1_dst[3] */ 2015 2016 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 2017 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 2018 2019 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2020 pi2_scratch += out_stride; 2021 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2022 pi2_scratch += out_stride; 2023 2024 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 2025 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 2026 2027 } 2028 2029 /* eo1[0-3] */ 2030 { 2031 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 2032 2033 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 2034 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22); 2035 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22); 2036 2037 /* e[1][0-3] stored in pu1_dst[4] */ 2038 /* e[6][0-3] stored in pu1_dst[5] */ 2039 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 2040 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 2041 2042 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2043 pi2_scratch += out_stride; 2044 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2045 pi2_scratch += out_stride; 2046 } 2047 2048 /* eo1[4-7] */ 2049 { 2050 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 2051 2052 /* ee[1] and 
ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 2053 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23); 2054 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23); 2055 2056 /* e[1][4-7] stored in pu1_dst[6]*/ 2057 /* e[6][4-7] stored in pu1_dst[7] */ 2058 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31); 2059 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31); 2060 2061 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2062 pi2_scratch += out_stride; 2063 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2064 pi2_scratch += out_stride; 2065 2066 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 2067 2068 } 2069 2070 /* eo2[0-3] */ 2071 { 2072 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2073 2074 /* e[2][0-3] stored in pu1_dst[8]*/ 2075 /* e[5][0-3] stored in pu1_dst[9] */ 2076 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 2077 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 2078 2079 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2080 pi2_scratch += out_stride; 2081 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2082 pi2_scratch += out_stride; 2083 } 2084 2085 /* eo2[4-7] */ 2086 { 2087 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 2088 2089 /* e[2][4-7] stored in pu1_dst[10]*/ 2090 /* e[5][4-7] stored in pu1_dst[11] */ 2091 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31); 2092 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31); 2093 2094 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2095 pi2_scratch += out_stride; 2096 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2097 pi2_scratch += out_stride; 2098 2099 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 2100 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 2101 2102 } 2103 2104 /* eo3[0-3] */ 2105 { 2106 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, 
m_coeff3); 2107 2108 /* e[3][0-3] stored in pu1_dst[12]*/ 2109 /* e[4][0-3] stored in pu1_dst[13] */ 2110 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 2111 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 2112 2113 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2114 pi2_scratch += out_stride; 2115 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2116 pi2_scratch += out_stride; 2117 } 2118 2119 /* eo3[4-7] */ 2120 { 2121 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 2122 2123 /* e[3][4-7] stored in pu1_dst[14]*/ 2124 /* e[4][4-7] stored in pu1_dst[15] */ 2125 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31); 2126 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31); 2127 2128 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2129 pi2_scratch += out_stride; 2130 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2131 pi2_scratch += out_stride; 2132 } 2133 } 2134 } 2135 2136 else 2137 { 2138 /* eeo */ 2139 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 2140 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 2141 { 2142 2143 2144 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0 2145 pi2_src_temp += (stride); 2146 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4 2147 pi2_src_temp += (stride * 7); 2148 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //8 2149 pi2_src_temp += (stride); 2150 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //12 2151 if(!i) 2152 { 2153 pi2_src_temp += (stride * 6 + 8); 2154 } 2155 else 2156 { 2157 pi2_src_temp += (stride * 2 + 8); 2158 } 2159 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //14 2160 pi2_src_temp -= (stride); 2161 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //10 2162 pi2_src_temp -= (stride * 7); 2163 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6 2164 pi2_src_temp -= (stride); 2165 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2 2166 
2167 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36 2168 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83 2169 2170 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's 2171 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's 2172 2173 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 2174 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 2175 2176 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 2177 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 2178 2179 2180 } 2181 2182 /* eee */ 2183 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ 2184 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ 2185 { 2186 /* Loading coeff and src for use in next block */ 2187 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64 64 2188 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64 2189 2190 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's 2191 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's 2192 2193 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); 2194 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4); 2195 2196 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 2197 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); 2198 2199 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 2200 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18 2201 2202 } 2203 2204 /* eo */ 2205 { 2206 WORD16 *pi2_scratch = temp_array; 2207 WORD32 out_stride = 8; 2208 2209 2210 2211 /* eo0[0-3] */ 2212 { 2213 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 2214 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 2215 m_temp_reg_12 = 
_mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); 2216 m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77); 2217 2218 2219 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2220 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2); 2221 2222 2223 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 2224 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20); 2225 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20); 2226 2227 2228 /* e[0][0-3] stored in pi2_tmp[0][0-7] */ 2229 /* e[7][0-3] stored in pi2_tmp[0][8-15] */ 2230 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 2231 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 2232 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); 2233 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); 2234 2235 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2236 pi2_scratch += out_stride; 2237 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2238 pi2_scratch += out_stride; 2239 2240 2241 } 2242 2243 /* eo0[4-7] */ 2244 { 2245 2246 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 2247 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2); 2248 2249 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 2250 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21); 2251 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21); 2252 2253 /* e[0][4-7] stored in pi2_tmp[1][0-7] */ 2254 /* e[7][4-7] stored in pi2_tmp[1][8-15] */ 2255 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 2256 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 2257 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); 2258 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); 2259 2260 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2261 pi2_scratch += out_stride; 2262 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2263 pi2_scratch += out_stride; 2264 2265 m_coeff3 = 
_mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 2266 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 2267 2268 } 2269 2270 /* eo1[0-3] */ 2271 { 2272 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 2273 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4); 2274 2275 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 2276 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22); 2277 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22); 2278 2279 /* e[1][0-3] stored in pi2_tmp[2][0-7] */ 2280 /* e[6][0-3] stored in pi2_tmp[2][8-15] */ 2281 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 2282 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 2283 m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32); 2284 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32); 2285 2286 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2287 pi2_scratch += out_stride; 2288 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2289 pi2_scratch += out_stride; 2290 2291 } 2292 2293 /* eo1[4-7] */ 2294 { 2295 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 2296 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 2297 2298 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 2299 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23); 2300 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23); 2301 2302 /* e[1][4-7] stored in pi2_tmp[3][0-7] */ 2303 /* e[6][4-7] stored in pi2_tmp[3][8-15] */ 2304 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31); 2305 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31); 2306 m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33); 2307 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33); 2308 2309 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2310 pi2_scratch += out_stride; 2311 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2312 
pi2_scratch += out_stride; 2313 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 2314 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75 2315 } 2316 2317 /* eo2[0-3] */ 2318 { 2319 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2320 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2); 2321 2322 /* e[2][0-3] stored in pi2_tmp[4][0-7] */ 2323 /* e[5][0-3] stored in pi2_tmp[4][8-15] */ 2324 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 2325 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 2326 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); 2327 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); 2328 2329 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2330 pi2_scratch += out_stride; 2331 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2332 pi2_scratch += out_stride; 2333 } 2334 2335 /* eo2[4-7] */ 2336 { 2337 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 2338 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2); 2339 2340 /* e[2][4-7] stored in pi2_tmp[5][0-7] */ 2341 /* e[5][4-7] stored in pi2_tmp[5][8-15] */ 2342 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31); 2343 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31); 2344 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); 2345 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); 2346 2347 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2348 pi2_scratch += out_stride; 2349 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2350 pi2_scratch += out_stride; 2351 2352 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 2353 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 2354 2355 } 2356 2357 /* eo3[0-3] */ 2358 { 2359 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 2360 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4); 2361 2362 /* 
e[3][0-3] stored in pi2_tmp[6][0-7] */ 2363 /* e[4][0-3] stored in pi2_tmp[6][8-15] */ 2364 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 2365 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 2366 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); 2367 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); 2368 2369 2370 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2371 pi2_scratch += out_stride; 2372 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2373 pi2_scratch += out_stride; 2374 } 2375 2376 /* eo3[4-7] */ 2377 { 2378 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 2379 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 2380 2381 /* e[3][4-7] stored in pi2_tmp[7][0-7] */ 2382 /* e[4][4-7] stored in pi2_tmp[7][8-15] */ 2383 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31); 2384 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31); 2385 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); 2386 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); 2387 2388 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2389 pi2_scratch += out_stride; 2390 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2391 pi2_scratch += out_stride; 2392 } 2393 } 2394 } 2395 2396 if(zero_last12_rows_stg2) 2397 { 2398 /* o & stage 2 pre-transposed out */ 2399 { 2400 WORD32 j; 2401 WORD16 *pi2_src_scratch = temp_array; 2402 WORD16 *pi2_dst_scratch = (i) ? 
(pi2_tmp + 8) : (pi2_tmp); 2403 WORD32 out_stride = (trans_size); 2404 WORD32 in_stride = (8) * 4; 2405 2406 pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2); 2407 2408 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1 2409 2410 pi2_src_temp += (stride * 9); 2411 2412 if(0 == i) 2413 { 2414 pi2_src_temp -= (stride * 2 - 8); 2415 } 2416 else 2417 { 2418 pi2_src_temp -= (stride * 6 - 8); 2419 } 2420 pi2_src_temp -= (stride * 9); 2421 2422 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3 2423 2424 2425 for(j = 0; j < 2; j++) 2426 { 2427 if(j) 2428 { 2429 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B 2430 } 2431 else 2432 { 2433 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B 2434 } 2435 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 2436 2437 /* o0[0-3] */ 2438 { 2439 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2440 2441 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2442 pi2_src_scratch += in_stride; 2443 2444 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 2445 2446 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2447 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2448 2449 2450 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2451 m_count = _mm_cvtsi32_si128(i4_shift); 2452 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); 2453 2454 2455 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2456 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2457 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2458 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2459 2460 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2461 2462 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2463 pi2_dst_scratch += out_stride; 2464 } 2465 2466 /* o1[0-3] */ 2467 { 2468 m_temp_reg_24 = 
_mm_madd_epi16(m_temp_reg_10, m_coeff5); 2469 2470 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2471 pi2_src_scratch += in_stride; 2472 2473 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 2474 2475 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 2476 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 2477 2478 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2479 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2480 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2481 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2482 2483 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2484 2485 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2486 pi2_dst_scratch += ((!i) * out_stride + 8); 2487 } 2488 2489 /* o2[0-3] */ 2490 { 2491 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2492 2493 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2494 pi2_src_scratch += in_stride; 2495 2496 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 2497 2498 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2499 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2500 2501 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2502 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2503 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2504 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2505 2506 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2507 2508 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2509 pi2_dst_scratch += out_stride; 2510 } 2511 2512 /* o3[0-3] */ 2513 { 2514 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 2515 2516 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2517 pi2_src_scratch += 8; 2518 2519 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 2520 2521 
m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 2522 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 2523 2524 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2525 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2526 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2527 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2528 2529 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2530 2531 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2532 pi2_dst_scratch += (i * out_stride + 8); 2533 } 2534 2535 /* o4[0-3] */ 2536 { 2537 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2538 2539 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2540 pi2_src_scratch -= in_stride; 2541 2542 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 2543 2544 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2545 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2546 2547 2548 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2549 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2550 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2551 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2552 2553 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2554 2555 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2556 pi2_dst_scratch += out_stride; 2557 } 2558 2559 /* o5[0-3] */ 2560 { 2561 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 2562 2563 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2564 pi2_src_scratch -= in_stride; 2565 2566 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 2567 2568 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 2569 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 2570 2571 2572 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2573 m_temp_reg_30 = 
_mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2574 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2575 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2576 2577 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2578 2579 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2580 pi2_dst_scratch += ((!i) * out_stride + 8); 2581 } 2582 2583 /* o6[0-3] */ 2584 { 2585 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2586 2587 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2588 pi2_src_scratch -= in_stride; 2589 2590 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 2591 2592 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2593 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2594 2595 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2596 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2597 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2598 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2599 2600 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2601 2602 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2603 pi2_dst_scratch += out_stride; 2604 } 2605 2606 /* o7[0-3] */ 2607 { 2608 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 2609 2610 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2611 pi2_src_scratch += 8; 2612 2613 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 2614 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 2615 2616 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2617 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2618 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2619 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2620 2621 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2622 2623 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2624 pi2_dst_scratch += (i * 
out_stride + 8); 2625 } 2626 2627 2628 } 2629 } 2630 } 2631 else if(zero_last8_rows_stg2) 2632 { 2633 /* o & stage 2 pre-transposed out */ 2634 { 2635 WORD32 j; 2636 WORD16 *pi2_src_scratch = temp_array; 2637 WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp); 2638 WORD32 out_stride = (trans_size); 2639 WORD32 in_stride = (8) * 4; 2640 2641 pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2); 2642 2643 2644 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1 2645 pi2_src_temp += (stride); 2646 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5 2647 pi2_src_temp += (stride * 8); 2648 2649 if(0 == i) 2650 { 2651 pi2_src_temp -= (stride * 2 - 8); 2652 } 2653 else 2654 { 2655 pi2_src_temp -= (stride * 6 - 8); 2656 } 2657 2658 pi2_src_temp -= (stride * 8); 2659 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7 2660 pi2_src_temp -= (stride); 2661 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3 2662 2663 2664 for(j = 0; j < 2; j++) 2665 { 2666 if(j) 2667 { 2668 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B 2669 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B 2670 } 2671 else 2672 { 2673 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B 2674 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B 2675 } 2676 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 2677 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70 2678 2679 /* o0[0-3] */ 2680 { 2681 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2682 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2683 2684 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2685 pi2_src_scratch += in_stride; 2686 2687 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 2688 m_coeff6 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43 2689 2690 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2691 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2692 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2693 2694 2695 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2696 m_count = _mm_cvtsi32_si128(i4_shift); 2697 2698 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); 2699 2700 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2701 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2702 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2703 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2704 2705 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2706 2707 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2708 pi2_dst_scratch += out_stride; 2709 } 2710 2711 /* o1[0-3] */ 2712 { 2713 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 2714 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 2715 2716 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2717 pi2_src_scratch += in_stride; 2718 2719 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 2720 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87 2721 2722 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 2723 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 2724 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 2725 2726 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2727 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2728 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2729 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2730 2731 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2732 2733 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2734 pi2_dst_scratch += ((!i) * out_stride + 8); 2735 } 2736 2737 /* o2[0-3] */ 2738 { 
2739 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2740 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2741 2742 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2743 pi2_src_scratch += in_stride; 2744 2745 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 2746 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9 2747 2748 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 2749 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2750 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2751 2752 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2753 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2754 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2755 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2756 2757 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2758 2759 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2760 pi2_dst_scratch += out_stride; 2761 } 2762 2763 /* o3[0-3] */ 2764 { 2765 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 2766 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 2767 2768 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2769 pi2_src_scratch += 8; 2770 2771 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 2772 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90 2773 2774 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25); 2775 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 2776 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 2777 2778 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2779 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2780 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2781 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2782 2783 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, 
m_temp_reg_31); 2784 2785 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2786 pi2_dst_scratch += (i * out_stride + 8); 2787 } 2788 2789 /* o4[0-3] */ 2790 { 2791 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2792 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2793 2794 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2795 pi2_src_scratch -= in_stride; 2796 2797 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 2798 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25 2799 2800 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 2801 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2802 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2803 2804 2805 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2806 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2807 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2808 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2809 2810 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2811 2812 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2813 pi2_dst_scratch += out_stride; 2814 } 2815 2816 /* o5[0-3] */ 2817 { 2818 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 2819 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 2820 2821 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2822 pi2_src_scratch -= in_stride; 2823 2824 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 2825 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80 2826 2827 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 2828 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 2829 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 2830 2831 2832 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2833 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, 
m_rdng_factor); 2834 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2835 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2836 2837 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2838 2839 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2840 pi2_dst_scratch += ((!i) * out_stride + 8); 2841 } 2842 2843 /* o6[0-3] */ 2844 { 2845 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2846 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2847 2848 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2849 pi2_src_scratch -= in_stride; 2850 2851 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 2852 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57 2853 2854 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2855 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2856 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2857 2858 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2859 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2860 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2861 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2862 2863 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2864 2865 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2866 pi2_dst_scratch += out_stride; 2867 } 2868 2869 /* o7[0-3] */ 2870 { 2871 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 2872 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 2873 2874 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2875 pi2_src_scratch += 8; 2876 2877 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 2878 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 2879 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 2880 2881 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2882 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, 
m_rdng_factor); 2883 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2884 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2885 2886 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2887 2888 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2889 pi2_dst_scratch += (i * out_stride + 8); 2890 } 2891 } 2892 } 2893 } 2894 else 2895 { 2896 /* o & stage 2 pre-transposed out */ 2897 { 2898 WORD32 j; 2899 WORD16 *pi2_src_scratch = temp_array; 2900 WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp); 2901 WORD32 out_stride = (trans_size); 2902 WORD32 in_stride = (8) * 4; 2903 2904 pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2); 2905 2906 2907 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1 2908 pi2_src_temp += (stride); 2909 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5 2910 pi2_src_temp += (stride * 7); 2911 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //9 2912 pi2_src_temp += (stride); 2913 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //13 2914 if(0 == i) 2915 { 2916 pi2_src_temp -= (stride * 2 - 8); 2917 } 2918 else 2919 { 2920 pi2_src_temp -= (stride * 6 - 8); 2921 } 2922 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //15 2923 pi2_src_temp -= (stride); 2924 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //11 2925 pi2_src_temp -= (stride * 7); 2926 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7 2927 pi2_src_temp -= (stride); 2928 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3 2929 2930 2931 for(j = 0; j < 2; j++) 2932 { 2933 2934 if(j) //H8B= higher 8 bytes L8B lower 8 bytes 2935 { 2936 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B 2937 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B 2938 m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B 2939 m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, 
m_temp_reg_77); //row 13 and row 15 H8B 2940 } 2941 else 2942 { 2943 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B 2944 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B 2945 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B 2946 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B 2947 } 2948 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 2949 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70 2950 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43 2951 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25 9 2952 2953 2954 /* o0[0-3] */ 2955 { 2956 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2957 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2958 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 2959 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 2960 2961 2962 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2963 pi2_src_scratch += in_stride; 2964 2965 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 2966 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43 2967 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90 2968 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25 2969 2970 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2971 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 2972 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 2973 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2974 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2975 2976 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2977 m_count = _mm_cvtsi32_si128(i4_shift); 2978 m_rdng_factor = 
_mm_shuffle_epi32(m_rdng_factor, 0x00); 2979 2980 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2981 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2982 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2983 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2984 2985 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2986 2987 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2988 pi2_dst_scratch += out_stride; 2989 } 2990 2991 /* o1[0-3] */ 2992 { 2993 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 2994 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 2995 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 2996 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 2997 2998 2999 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3000 pi2_src_scratch += in_stride; 3001 3002 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 3003 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87 3004 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57 3005 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43 3006 3007 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 3008 m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27); 3009 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26); 3010 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 3011 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 3012 3013 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3014 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3015 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3016 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3017 3018 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3019 3020 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3021 pi2_dst_scratch += ((!i) * out_stride + 
8); 3022 } 3023 3024 /* o2[0-3] */ 3025 { 3026 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3027 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3028 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3029 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3030 3031 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3032 pi2_src_scratch += in_stride; 3033 3034 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 3035 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9 3036 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25 3037 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57 3038 3039 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 3040 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); 3041 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22); 3042 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3043 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3044 3045 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3046 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3047 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3048 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3049 3050 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3051 3052 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3053 pi2_dst_scratch += out_stride; 3054 } 3055 3056 /* o3[0-3] */ 3057 { 3058 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 3059 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 3060 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 3061 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 3062 3063 3064 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3065 pi2_src_scratch += 8; 3066 3067 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 3068 
m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90 3069 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87 3070 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70 3071 3072 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25); 3073 m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27); 3074 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26); 3075 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 3076 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 3077 3078 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3079 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3080 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3081 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3082 3083 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3084 3085 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3086 pi2_dst_scratch += (i * out_stride + 8); 3087 } 3088 3089 /* o4[0-3] */ 3090 { 3091 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3092 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3093 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3094 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3095 3096 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3097 pi2_src_scratch -= in_stride; 3098 3099 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 3100 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25 3101 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70 3102 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80 3103 3104 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 3105 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); 3106 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22); 3107 m_temp_reg_31 = 
_mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3108 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3109 3110 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3111 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3112 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3113 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3114 3115 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3116 3117 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3118 pi2_dst_scratch += out_stride; 3119 } 3120 3121 /* o5[0-3] */ 3122 { 3123 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 3124 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 3125 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 3126 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 3127 3128 3129 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3130 pi2_src_scratch -= in_stride; 3131 3132 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 3133 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80 3134 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9 3135 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87 3136 3137 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 3138 m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27); 3139 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26); 3140 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 3141 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 3142 3143 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3144 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3145 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3146 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3147 3148 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3149 3150 _mm_storeu_si128((__m128i 
*)pi2_dst_scratch, m_temp_reg_30); 3151 pi2_dst_scratch += ((!i) * out_stride + 8); 3152 } 3153 3154 /* o6[0-3] */ 3155 { 3156 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3157 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3158 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3159 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3160 3161 3162 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3163 pi2_src_scratch -= in_stride; 3164 3165 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 3166 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57 3167 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80 3168 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90 3169 3170 3171 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3172 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); 3173 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3174 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3175 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3176 3177 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3178 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3179 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3180 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3181 3182 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3183 3184 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3185 pi2_dst_scratch += out_stride; 3186 } 3187 3188 /* o7[0-3] */ 3189 { 3190 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 3191 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 3192 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 3193 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 3194 3195 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3196 pi2_src_scratch += 8; 3197 3198 
m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 3199 m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27); 3200 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26); 3201 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 3202 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 3203 3204 3205 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3206 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3207 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3208 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3209 3210 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3211 3212 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3213 pi2_dst_scratch += (i * out_stride + 8); 3214 } 3215 3216 } 3217 } 3218 } 3219 } 3220 3221 /* Transpose */ 3222 { 3223 WORD16 *pi2_src_scratch; 3224 UWORD8 *pu1_pred_temp = pu1_pred; 3225 WORD32 out_stride = dst_strd; 3226 WORD32 in_stride = trans_size; 3227 WORD32 j; 3228 m_temp_reg_1 = _mm_setzero_si128(); 3229 for(i = 0; i < 2; i++) 3230 { 3231 pi2_src_scratch = (i) ? 
(pi2_tmp + 8) : pi2_tmp; 3232 3233 for(j = 0; j < 2; j++) 3234 { 3235 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a 3236 pi2_src_scratch += in_stride; 3237 m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c 3238 pi2_src_scratch += ((!i) * in_stride + 8); 3239 m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e 3240 pi2_src_scratch += (in_stride); 3241 m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g 3242 pi2_src_scratch += (i * in_stride + 8); 3243 m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i 3244 pi2_src_scratch += in_stride; 3245 m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k 3246 pi2_src_scratch += ((!i) * in_stride + 8); 3247 m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m 3248 pi2_src_scratch += in_stride; 3249 m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o 3250 pi2_src_scratch += (i * in_stride + 8); 3251 3252 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0 3253 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0 3254 3255 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0 3256 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0 3257 3258 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0 3259 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0 3260 3261 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0 3262 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0 3263 3264 3265 m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0 3266 m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2 3267 3268 m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0 3269 m_temp_reg_33 = 
_mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2 3270 3271 m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0 3272 m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2 3273 3274 m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0 3275 m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2 3276 3277 3278 m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0 3279 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp); 3280 3281 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1); 3282 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1); 3283 3284 m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0 3285 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_40, m_temp_reg_0); 3286 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_44, m_temp_reg_12); 3287 3288 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 3289 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); 3290 pu1_dst += out_stride; 3291 pu1_pred_temp += pred_strd; 3292 3293 m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1 3294 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp); 3295 3296 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1); 3297 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1); 3298 3299 m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0 3300 m_temp_reg_41 = _mm_add_epi16(m_temp_reg_41, m_temp_reg_0); 3301 m_temp_reg_45 = _mm_add_epi16(m_temp_reg_45, m_temp_reg_12); 3302 3303 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_41, m_temp_reg_45); 3304 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); 3305 pu1_dst += out_stride; 3306 pu1_pred_temp += pred_strd; 3307 3308 m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2 3309 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp); 3310 3311 
m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1); 3312 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1); 3313 3314 m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp0 3315 m_temp_reg_42 = _mm_add_epi16(m_temp_reg_42, m_temp_reg_0); 3316 m_temp_reg_46 = _mm_add_epi16(m_temp_reg_46, m_temp_reg_12); 3317 3318 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_42, m_temp_reg_46); 3319 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); 3320 pu1_dst += out_stride; 3321 pu1_pred_temp += pred_strd; 3322 3323 m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3 3324 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp); 3325 3326 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1); 3327 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1); 3328 3329 m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp0 3330 m_temp_reg_43 = _mm_add_epi16(m_temp_reg_43, m_temp_reg_0); 3331 m_temp_reg_47 = _mm_add_epi16(m_temp_reg_47, m_temp_reg_12); 3332 3333 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_43, m_temp_reg_47); 3334 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); 3335 pu1_dst += out_stride; 3336 pu1_pred_temp += pred_strd; 3337 } 3338 } 3339 } 3340 } 3341